From 6a70a30ca1ca1f0d0397ef9e6a9817b0f6cba079 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Thu, 5 Oct 2023 21:43:19 -0700 Subject: Include a copy of PFFFT This is a notably faster FFT implementation for 32-bit float signals, provided under a 3-clause BSD license. --- common/pffft.cpp | 2021 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2021 insertions(+) create mode 100644 common/pffft.cpp (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp new file mode 100644 index 00000000..5b3b25e7 --- /dev/null +++ b/common/pffft.cpp @@ -0,0 +1,2021 @@ +//$ nobt + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + * Copyright (c) 2023 Christopher Robinson + * + * Based on original fortran 77 code from FFTPACKv4 from NETLIB + * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + * of NCAR, in 1985. + * + * As confirmed by the NCAR fftpack software curators, the following + * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + * released under the same terms. + * + * FFTPACK license: + * + * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + * + * Copyright (c) 2004 the University Corporation for Atmospheric + * Research ("UCAR"). All rights reserved. Developed by NCAR's + * Computational and Information Systems Laboratory, UCAR, + * www.cisl.ucar.edu. + * + * Redistribution and use of the Software in source and binary forms, + * with or without modification, is permitted provided that the + * following conditions are met: + * + * - Neither the names of NCAR's Computational and Information Systems + * Laboratory, the University Corporation for Atmospheric Research, + * nor the names of its sponsors or contributors may be used to + * endorse or promote products derived from this Software without + * specific prior written permission. + * + * - Redistributions of source code must retain the above copyright + * notices, this list of conditions, and the disclaimer below. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the disclaimer below in the + * documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + * SOFTWARE. + * + * + * PFFFT : a Pretty Fast FFT. + * + * This file is largerly based on the original FFTPACK implementation, modified + * in order to take advantage of SIMD instructions of modern CPUs. 
+ */ + +#include "pffft.h" + +#include +#include +#include +#include +#include + +#include "almalloc.h" +#include "alnumbers.h" + +#if defined(__GNUC__) +#define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +#define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +#define RESTRICT __restrict + +#elif defined(_MSC_VER) + +#define ALWAYS_INLINE(return_type) __forceinline return_type +#define NEVER_INLINE(return_type) __declspec(noinline) return_type +#define RESTRICT __restrict +#endif + + +/* + * vector support macros: the rest of the code is independant of + * SSE/Altivec/NEON -- adding support for other platforms with 4-element + * vectors should be limited to these macros + */ + +// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +//#define PFFFT_SIMD_DISABLE + +#ifndef PFFFT_SIMD_DISABLE +/* + * Altivec support macros + */ +#if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__) +typedef vector float v4sf; +#define SIMD_SZ 4 +#define VZERO() ((vector float) vec_splat_u8(0)) +#define VMUL(a,b) vec_madd(a,b, VZERO()) +#define VADD(a,b) vec_add(a,b) +#define VMADD(a,b,c) vec_madd(a,b,c) +#define VSUB(a,b) vec_sub(a,b) +inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } +#define LD_PS1(p) ld_ps1(&p) +#define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } while(0) +#define UNINTERLEAVE2(in1, in2, out1, out2) do { \ + vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ + vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ + v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ +} while(0) +#define VTRANSPOSE4(x0,x1,x2,x3) do { \ + v4sf y0 = vec_mergeh(x0, x2); \ + v4sf y1 = vec_mergel(x0, x2); \ + v4sf y2 = vec_mergeh(x1, x3); \ + v4sf y3 = vec_mergel(x1, x3); \ + x0 = vec_mergeh(y0, y2); \ + x1 = vec_mergel(y0, y2); \ + x2 = vec_mergeh(y1, y3); \ + x3 = vec_mergel(y1, y3); \ +} while(0) +#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) +#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) + +/* + * SSE1 support macros + */ +#elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X86) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + +#include +typedef __m128 v4sf; +#define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. 
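/* For reference, every backend implements the same small element-wise
 * contract on its 4-float vector type (illustrative summary only; the
 * authoritative definitions are the per-ISA macros above and below):
 *   VZERO()            -> {0,0,0,0}
 *   LD_PS1(x)          -> {x,x,x,x}
 *   VMADD(a,b,c)       -> a*b + c (not necessarily a fused multiply-add)
 *   INTERLEAVE2(a,b)   -> {a0,b0,a1,b1}, {a2,b2,a3,b3}
 *   UNINTERLEAVE2(a,b) -> {a0,a2,b0,b2}, {a1,a3,b1,b3}
 *   VSWAPHL(a,b)       -> {b0,b1,a2,a3}
 */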
+#define VZERO() _mm_setzero_ps() +#define VMUL(a,b) _mm_mul_ps(a,b) +#define VADD(a,b) _mm_add_ps(a,b) +#define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +#define VSUB(a,b) _mm_sub_ps(a,b) +#define LD_PS1(p) _mm_set1_ps(p) +#define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0) +#define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0) +#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) +#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) + +/* + * ARM NEON support macros + */ +#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) + +#include +typedef float32x4_t v4sf; +#define SIMD_SZ 4 +#define VZERO() vdupq_n_f32(0) +#define VMUL(a,b) vmulq_f32(a,b) +#define VADD(a,b) vaddq_f32(a,b) +#define VMADD(a,b,c) vmlaq_f32(c,a,b) +#define VSUB(a,b) vsubq_f32(a,b) +#define LD_PS1(p) vld1q_dup_f32(&(p)) +#define INTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) +#define UNINTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) +#define VTRANSPOSE4(x0,x1,x2,x3) do { \ + float32x4x2_t t0_ = vzipq_f32(x0, x2); \ + float32x4x2_t t1_ = vzipq_f32(x1, x3); \ + float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ + float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ + x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ +} while(0) +// marginally faster version +//#define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) +#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0x3) == 0) + +#else + +#warning "building with simd disabled !\n"; +#define PFFFT_SIMD_DISABLE // fallback to scalar code +#endif + +#endif /* PFFFT_SIMD_DISABLE */ + +// fallback mode for situations where SIMD is not available, use scalar mode instead +#ifdef PFFFT_SIMD_DISABLE +typedef float v4sf; +#define SIMD_SZ 1 +#define VZERO() 0.f +#define VMUL(a,b) ((a)*(b)) +#define VADD(a,b) ((a)+(b)) +#define VMADD(a,b,c) ((a)*(b)+(c)) +#define VSUB(a,b) ((a)-(b)) +#define LD_PS1(p) (p) +#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0x3) == 0) +#endif + +// shortcuts for complex multiplcations +#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } while(0) +#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } while(0) +#ifndef SVMUL +// multiply a scalar with a vector +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +#if !defined(PFFFT_SIMD_DISABLE) +/* TODO: Remove this, type-punning to access individual SIMD values is bad. 
*/ +typedef union v4sf_union { + v4sf v; + float f[4]; +} v4sf_union; + +#include + +#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) + +/* detect bugs with the vector support macros */ +void validate_pffft_simd() +{ + float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; + v4sf_union a0, a1, a2, a3, t, u; + memcpy(a0.f, f, 4*sizeof(float)); + memcpy(a1.f, f+4, 4*sizeof(float)); + memcpy(a2.f, f+8, 4*sizeof(float)); + memcpy(a3.f, f+12, 4*sizeof(float)); + + t = a0; u = a1; t.v = VZERO(); + printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); + t.v = VADD(a1.v, a2.v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); + t.v = VMUL(a1.v, a2.v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); + t.v = VMADD(a1.v, a2.v,a0.v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); + UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); + + t.v=LD_PS1(f[15]); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 15, 15, 15, 15); + t.v = VSWAPHL(a1.v, a2.v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 8, 9, 6, 7); + VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], + a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); +} +#endif //!PFFFT_SIMD_DISABLE + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... 
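/* A caller-side sketch of how these aligned buffers are meant to be used
 * (hypothetical example; see pffft.h for the actual contract): every float
 * buffer passed to the transform functions must be SIMD-aligned, and the
 * work area needs N floats for a real transform, 2*N for a complex one.
 *
 *   PFFFT_Setup *setup = pffft_new_setup(1024, PFFFT_REAL);
 *   auto *in   = static_cast<float*>(pffft_aligned_malloc(1024*sizeof(float)));
 *   auto *out  = static_cast<float*>(pffft_aligned_malloc(1024*sizeof(float)));
 *   auto *work = static_cast<float*>(pffft_aligned_malloc(1024*sizeof(float)));
 *   pffft_transform_ordered(setup, in, out, work, PFFFT_FORWARD);
 *   pffft_aligned_free(work);
 *   pffft_aligned_free(out);
 *   pffft_aligned_free(in);
 *   pffft_destroy_setup(setup);
 */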
+ +void *pffft_aligned_malloc(size_t nb_bytes) +{ return al_malloc(MALLOC_V4SF_ALIGNMENT, nb_bytes); } + +void pffft_aligned_free(void *p) { al_free(p); } + +int pffft_simd_size() { return SIMD_SZ; } + +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) +{ + const int l1ido = l1*ido; + if(ido <= 2) + { + for(int k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) + { + ch[0] = VADD(cc[0], cc[ido+0]); + ch[l1ido] = VSUB(cc[0], cc[ido+0]); + ch[1] = VADD(cc[1], cc[ido+1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); + } + } + else + { + for(int k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) + { + for(int i=0; i 2); + for(int k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) + { + for(int i=0; i 2); + for(int k = 0; k < l1; ++k, cc += 5*ido, ch += ido) + { + for(int i = 0; i < ido-1; i += 2) + { + v4sf ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + v4sf ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + v4sf ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + v4sf ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + v4sf tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + v4sf tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + v4sf tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + v4sf tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + v4sf cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + v4sf ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + v4sf cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + v4sf ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + v4sf dr3 = VSUB(cr3, ci4); + v4sf dr4 = VADD(cr3, ci4); + v4sf di3 = VADD(ci3, cr4); + v4sf di4 = VSUB(ci3, cr4); + v4sf dr5 = VADD(cr2, ci5); + v4sf dr2 = VSUB(cr2, ci5); + v4sf di5 = VSUB(ci2, cr5); + v4sf di2 = VADD(ci2, cr5); + float wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; + float wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1) +{ + static constexpr float minus_one = -1.f; + const int l1ido = l1*ido; + + for(int k=0; k < l1ido; k += ido) + { + v4sf a = cc[k], b = cc[k + l1ido]; + ch[2*k] = VADD(a, b); + ch[2*(k+ido)-1] = VSUB(a, b); + } + if(ido < 2) + return; + if(ido != 2) + { + for(int k=0; k < l1ido; k += ido) + { + for(int i=2; i * -0.5f; + const int l1ido = l1*ido; + { + const v4sf *RESTRICT cc_ = cc, *RESTRICT cc_end = cc + l1ido; + v4sf *RESTRICT ch_ = ch; + while(cc != cc_end) + { + // this loop represents between 25% and 40% of total radf4_ps cost ! 
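// In scalar terms this is the length-4 real butterfly on the leading element
// of each block (a0..a3 taken l1ido apart):
//   ch[0]       = (a0+a2) + (a1+a3)
//   ch[2*ido-1] =  a0 - a2
//   ch[2*ido]   =  a3 - a1
//   ch[4*ido-1] = (a0+a2) - (a1+a3)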
+ v4sf a0 = cc[0], a1 = cc[l1ido]; + v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido]; + v4sf tr1 = VADD(a1, a3); + v4sf tr2 = VADD(a0, a2); + ch[2*ido-1] = VSUB(a0, a2); + ch[2*ido ] = VSUB(a3, a1); + ch[0 ] = VADD(tr1, tr2); + ch[4*ido-1] = VSUB(tr2, tr1); + cc += ido; ch += 4*ido; + } + cc = cc_; + ch = ch_; + } + if(ido < 2) + return; + if(ido != 2) + { + for(int k = 0; k < l1ido; k += ido) + { + const v4sf *RESTRICT pc = cc + 1 + k; + for(int i=2; i(in); /* this is in fact the output .. */ +} /* rfftf1 */ + +static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const int *ifac) +{ + const v4sf *in = input_readonly; + v4sf *out = (in == work2 ? work1 : work2); + const int nf = ifac[1]; + int l1 = 1; + int iw = 0; + assert(in != out); + for(int k1=1; k1<=nf; k1++) + { + int ip = ifac[k1 + 1]; + int l2 = ip*l1; + int ido = n / l2; + switch(ip) + { + case 5: { + int ix2 = iw + ido; + int ix3 = ix2 + ido; + int ix4 = ix3 + ido; + radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); + } break; + case 4: { + int ix2 = iw + ido; + int ix3 = ix2 + ido; + radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); + } break; + case 3: { + int ix2 = iw + ido; + radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); + } break; + case 2: + radb2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; + } + l1 = l2; + iw += (ip - 1)*ido; + + if(out == work2) + { + out = work1; + in = work2; + } + else + { + out = work2; + in = work1; + } + } + return const_cast(in); /* this is in fact the output .. */ +} + +static int decompose(int n, int *ifac, const int *ntryh) +{ + int nl = n, nf = 0; + for(int j=0; ntryh[j]; ++j) + { + const int ntry = ntryh[j]; + while(nl != 1) + { + int nq = nl / ntry; + int nr = nl - ntry*nq; + if(nr == 0) + { + ifac[2+nf++] = ntry; + nl = nq; + if(ntry == 2 && nf != 1) + { + for(int i = 2; i <= nf; ++i) + { + int ib = nf - i + 2; + ifac[ib + 1] = ifac[ib]; + } + ifac[2] = 2; + } + } + else + break; + } + } + ifac[0] = n; + ifac[1] = nf; + return nf; +} + + + +static void rffti1_ps(int n, float *wa, int *ifac) +{ + static constexpr int ntryh[] = { 4,2,3,5,0 }; + + const int nf = decompose(n,ifac,ntryh); + const double argh = 2.0*al::numbers::pi / n; + int is = 0; + int nfm1 = nf - 1; + int l1 = 1; + for(int k1 = 1; k1 <= nfm1; k1++) + { + int ip = ifac[k1 + 1]; + int ld = 0; + int l2 = l1*ip; + int ido = n / l2; + int ipm = ip - 1; + for(int j = 1; j <= ipm; ++j) + { + int i = is, fi=0; + ld += l1; + double argld = ld*argh; + for(int ii = 3; ii <= ido; ii += 2) + { + i += 2; + fi += 1; + wa[i - 2] = static_cast(std::cos(fi*argld)); + wa[i - 1] = static_cast(std::sin(fi*argld)); + } + is += ido; + } + l1 = l2; + } +} /* rffti1 */ + +void cffti1_ps(int n, float *wa, int *ifac) +{ + static constexpr int ntryh[] = { 5,3,4,2,0 }; + + const int nf = decompose(n,ifac,ntryh); + const double argh = 2.0*al::numbers::pi / n; + int i = 1; + int l1 = 1; + for(int k1=1; k1<=nf; k1++) + { + int ip = ifac[k1+1]; + int ld = 0; + int l2 = l1*ip; + int ido = n / l2; + int idot = ido + ido + 2; + int ipm = ip - 1; + for(int j=1; j<=ipm; j++) + { + int i1 = i, fi = 0; + wa[i-1] = 1; + wa[i] = 0; + ld += l1; + double argld = ld*argh; + for(int ii = 4; ii <= idot; ii += 2) + { + i += 2; + fi += 1; + wa[i-1] = static_cast(std::cos(fi*argld)); + wa[i] = static_cast(std::sin(fi*argld)); + } + if(ip > 5) + { + wa[i1-1] = wa[i-1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + + +v4sf *cfftf1_ps(int n, const v4sf 
*input_readonly, v4sf *work1, v4sf *work2, const float *wa, + const int *ifac, float fsign) +{ + const v4sf *in = input_readonly; + v4sf *out = (in == work2 ? work1 : work2); + const int nf = ifac[1]; + int l1 = 1; + int iw = 0; + assert(in != out && work1 != work2); + for(int k1=2; k1<=nf+1; k1++) + { + int ip = ifac[k1]; + int l2 = ip*l1; + int ido = n / l2; + int idot = ido + ido; + switch(ip) + { + case 5: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + int ix4 = ix3 + idot; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); + } break; + case 4: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); + } break; + case 2: + passf2_ps(idot, l1, in, out, &wa[iw], fsign); + break; + case 3: { + int ix2 = iw + idot; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); + } break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if(out == work2) + { + out = work1; + in = work2; + } + else + { + out = work2; + in = work1; + } + } + + return const_cast(in); /* this is in fact the output .. */ +} + + +struct PFFFT_Setup { + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int ifac[15]; + pffft_transform_t transform; + v4sf *data; // allocated room for twiddle coefs + float *e; // points into 'data' , N/4*3 elements + float *twiddle; // points into 'data', N/4 elements +}; + +PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) +{ + PFFFT_Setup *s = new PFFFT_Setup{}; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + * and 32 for real FFTs -- a lot of stuff would need to be rewritten to + * handle other cases (or maybe just switch to a scalar fft, I don't know..) + */ + if(transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } + if(transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } + //assert((N % 32) == 0); + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = (transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; + s->data = static_cast(pffft_aligned_malloc(2u*static_cast(s->Ncvec) * sizeof(v4sf))); + s->e = reinterpret_cast(s->data); + s->twiddle = reinterpret_cast(s->data + (2u*static_cast(s->Ncvec)*(SIMD_SZ-1))/SIMD_SZ); + + if(transform == PFFFT_REAL) + { + for(int k=0; k < s->Ncvec; ++k) + { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for(int m=0; m < SIMD_SZ-1; ++m) + { + const double A = -2.0*al::numbers::pi*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = static_cast(std::cos(A)); + s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = static_cast(std::sin(A)); + } + } + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } + else + { + for(int k=0; k < s->Ncvec; ++k) + { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for(int m=0; m < SIMD_SZ-1; ++m) + { + const double A = -2.0*al::numbers::pi*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); + s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); + } + } + cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } + + /* check that N is decomposable with allowed prime factors */ + int m = 1; + for(int k=0; k < s->ifac[1]; ++k) + m *= s->ifac[2+k]; + + if(m != N/SIMD_SZ) + { + pffft_destroy_setup(s); + s = nullptr; + } + + return s; +} + + +void pffft_destroy_setup(PFFFT_Setup *s) +{ + pffft_aligned_free(s->data); + delete s; +} + +#if !defined(PFFFT_SIMD_DISABLE) + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) +{ + v4sf g0, g1; + INTERLEAVE2(in[0], in[1], g0, g1); + in += in_stride; + + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] + for(int k=1; k < N; ++k) + { + v4sf h0, h1; + INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) +{ + v4sf g0, g1, h0, h1; + g0 = g1 = in[0]; ++in; + for(int k=1; k < N; ++k) + { + h0 = *in++; h1 = *in++; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); + out += out_stride; + g1 = h1; + } + h0 = *in++; h1 = g0; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); +} + +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) +{ + const int N = setup->N, Ncvec = setup->Ncvec; + const v4sf *vin = reinterpret_cast(in); + v4sf *vout = reinterpret_cast(out); + assert(in != out); + if(setup->transform == PFFFT_REAL) + { + const int dk = N/32; + if(direction == PFFFT_FORWARD) + { + for(int k=0; k < dk; ++k) + { + INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, reinterpret_cast(out + N/2)); + reversed_copy(dk, vin+6, 8, reinterpret_cast(out + N)); + } + else + { + for(int k=0; k < dk; ++k) + { + UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, reinterpret_cast(in + N/4), reinterpret_cast(out + N - 6*SIMD_SZ), -8); + unreversed_copy(dk, reinterpret_cast(in + 3*N/4), reinterpret_cast(out + N - 2*SIMD_SZ), -8); + } + } + else + { + if(direction == PFFFT_FORWARD) + { + for(int k=0; k < Ncvec; ++k) + { + int kk = (k/4) + (k%4)*(Ncvec/4); + INTERLEAVE2(vin[k*2], vin[k*2+1], 
vout[kk*2], vout[kk*2+1]); + } + } + else + { + for(int k=0; k < Ncvec; ++k) + { + int kk = (k/4) + (k%4)*(Ncvec/4); + UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + +void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +{ + const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for(int k=0; k < dk; ++k) + { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + /* + * transformation for each column is: + * + * [1 1 1 1 0 0 0 0] [r0] + * [1 0 -1 0 0 -1 0 1] [r1] + * [1 -1 1 -1 0 0 0 0] [r2] + * [1 0 -1 0 0 1 0 -1] [r3] + * [0 0 0 0 1 1 1 1] * [i0] + * [0 1 0 -1 1 0 -1 0] [i1] + * [0 0 0 0 1 -1 1 -1] [i2] + * [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + +void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +{ + const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for(int k=0; k < dk; ++k) + { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); + + VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + + +static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, + const v4sf *in, const v4sf *e, v4sf *out) +{ + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + r0 = *in0; i0 = *in1; + r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + /* + * transformation for each column is: + * + * [1 1 1 1 0 0 0 0] [r0] + * [1 0 -1 0 0 -1 0 1] [r1] + * [1 0 -1 0 0 1 0 -1] [r2] + * [1 -1 1 -1 0 0 0 0] [r3] + * [0 0 0 0 1 1 1 1] * [i0] + * [0 -1 0 1 -1 0 1 0] [i1] + * [0 -1 0 1 1 0 -1 0] [i2] + * [0 0 0 0 -1 1 -1 1] [i3] + */ + + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + VCPLXMUL(r1,i1,e[0],e[1]); + 
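/* (r2,i2) and (r3,i3) get the same treatment as (r1,i1) above:
 * VCPLXMUL(r,i,er,ei) is an element-wise complex multiply by the twiddle,
 * (r,i) <- (r*er - i*ei, i*er + r*ei).
 */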
VCPLXMUL(r2,i2,e[2],e[3]); + VCPLXMUL(r3,i3,e[4],e[5]); + + //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); + si0 = VADD(i0,i2); di0 = VSUB(i0,i2); + si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +{ + static constexpr float s = al::numbers::sqrt2_v/2.0f; + const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union cr, ci, *uout = reinterpret_cast(out); + v4sf save = in[7], zero=VZERO(); + float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + + cr.v = in[0]; ci.v = in[Ncvec*2-1]; + assert(in != out); + pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); + + /* + * [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + * + * [Xr(1)] ] [1 1 1 1 0 0 0 0] + * [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + * [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + * [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + * [Xi(1) ] [1 -1 1 -1 0 0 0 0] + * [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + * [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; + xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; + xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; + xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; + xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; + xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; + xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; + xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; + + for(int k=1; k < dk; ++k) + { + v4sf save_next = in[8*k+7]; + pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); + save = save_next; + } +} + +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, + int first) +{ + v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + /* + * transformation for each column is: + * + * [1 1 1 1 0 0 0 0] [r0] + * [1 0 0 -1 0 -1 -1 0] [r1] + * [1 -1 -1 1 0 0 0 0] [r2] + * [1 0 0 -1 0 1 1 0] [r3] + * [0 0 0 0 1 -1 1 -1] * [i0] + * [0 -1 1 0 1 0 0 1] [i1] + * [0 0 0 0 1 1 -1 -1] [i2] + * [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); + v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); + v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); + v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + VCPLXMULCONJ(r1,i1,e[0],e[1]); + VCPLXMULCONJ(r2,i2,e[2],e[3]); + VCPLXMULCONJ(r3,i3,e[4],e[5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + if(!first) + { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +{ + static constexpr float s = 
al::numbers::sqrt2_v; + const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union Xr, Xi, *uout = reinterpret_cast(out); + float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; + assert(in != out); + for(int k=0; k < 4; ++k) + { + Xr.f[k] = reinterpret_cast(in)[8*k]; + Xi.f[k] = reinterpret_cast(in)[8*k+4]; + } + + pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values + + /* + * [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + * + * [cr0] [1 0 2 0 1 0 0 0] + * [cr1] [1 0 0 0 -1 0 -2 0] + * [cr2] [1 0 -2 0 1 0 0 0] + * [cr3] [1 0 0 0 -1 0 2 0] + * [ci0] [0 2 0 2 0 0 0 0] + * [ci1] [0 s 0 -s 0 -s 0 -s] + * [ci2] [0 0 0 0 0 -2 0 2] + * [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for(int k=1; k < dk; ++k) + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); + + cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; + cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; + cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; + cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; + ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; + ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; + ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; + ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; +} + + +void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, + v4sf *scratch, pffft_direction_t direction, int ordered) +{ + const int Ncvec = setup->Ncvec; + const int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + assert(scratch != nullptr); + + const v4sf *vinput = reinterpret_cast(finput); + v4sf *voutput = reinterpret_cast(foutput); + v4sf *buff[2] = { voutput, scratch }; + int ib = (nf_odd ^ ordered ? 1 : 0); + + assert(VALIGNED(finput) && VALIGNED(foutput)); + + //assert(finput != foutput); + if(direction == PFFFT_FORWARD) + { + ib = !ib; + if(setup->transform == PFFFT_REAL) + { + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast(setup->e)); + } + else + { + v4sf *tmp = buff[ib]; + for(int k=0; k < Ncvec; ++k) + { + UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + } + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast(setup->e)); + } + if(ordered) + pffft_zreorder(setup, reinterpret_cast(buff[!ib]), reinterpret_cast(buff[ib]), PFFFT_FORWARD); + else + ib = !ib; + } + else + { + if(vinput == buff[ib]) + ib = !ib; // may happen when finput == foutput + + if(ordered) + { + pffft_zreorder(setup, reinterpret_cast(vinput), reinterpret_cast(buff[ib]), PFFFT_BACKWARD); + vinput = buff[ib]; + ib = !ib; + } + if(setup->transform == PFFFT_REAL) + { + pffft_real_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast(setup->e)); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } + else + { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast(setup->e)); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 
0 : 1); + for(int k=0; k < Ncvec; ++k) + { + INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + } + } + } + + if(buff[ib] != voutput) + { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(finput==foutput); + for(int k=0; k < Ncvec; ++k) + { + v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; + voutput[2*k] = a; voutput[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == voutput); +} + +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, + float scaling) +{ + const int Ncvec = s->Ncvec; + const v4sf *RESTRICT va = reinterpret_cast(a); + const v4sf *RESTRICT vb = reinterpret_cast(b); + v4sf *RESTRICT vab = reinterpret_cast(ab); + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +#ifndef __clang__ +#define ZCONVOLVE_USING_INLINE_NEON_ASM +#endif +#endif + +#ifndef ZCONVOLVE_USING_INLINE_ASM + const v4sf vscal = LD_PS1(scaling); +#endif + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + float ar1 = reinterpret_cast(va)[0].f[0]; + float ai1 = reinterpret_cast(va)[1].f[0]; + float br1 = reinterpret_cast(vb)[0].f[0]; + float bi1 = reinterpret_cast(vb)[1].f[0]; + float abr1 = reinterpret_cast(vab)[0].f[0]; + float abi1 = reinterpret_cast(vab)[1].f[0]; + +#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc + const float *a_ = a, *b_ = b; float *ab_ = ab; + int N = Ncvec; + asm volatile("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! 
\n" + "subs %3, #2 \n" + "bne 1b \n" + : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); + +#else // default routine, works fine for non-arm cpus with current compilers + + for(int i=0; i < Ncvec; i += 2) + { + v4sf ar4, ai4, br4, bi4; + ar4 = va[2*i+0]; ai4 = va[2*i+1]; + br4 = vb[2*i+0]; bi4 = vb[2*i+1]; + VCPLXMUL(ar4, ai4, br4, bi4); + vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]); + vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]); + ar4 = va[2*i+2]; ai4 = va[2*i+3]; + br4 = vb[2*i+2]; bi4 = vb[2*i+3]; + VCPLXMUL(ar4, ai4, br4, bi4); + vab[2*i+2] = VMADD(ar4, vscal, vab[2*i+2]); + vab[2*i+3] = VMADD(ai4, vscal, vab[2*i+3]); + } +#endif + + if(s->transform == PFFFT_REAL) + { + reinterpret_cast(vab)[0].f[0] = abr1 + ar1*br1*scaling; + reinterpret_cast(vab)[1].f[0] = abi1 + ai1*bi1*scaling; + } +} + + +#else // defined(PFFFT_SIMD_DISABLE) + +// standard routine using scalar floats, without SIMD stuff. + +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, + pffft_direction_t direction) +{ + const int N = setup->N; + if(setup->transform == PFFFT_COMPLEX) + { + for(int k=0; k < 2*N; ++k) + out[k] = in[k]; + return; + } + else if(direction == PFFFT_FORWARD) + { + float x_N = in[N-1]; + for(int k=N-1; k > 1; --k) + out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } + else + { + float x_N = in[1]; + for(int k=1; k < N-1; ++k) + out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + +#define pffft_transform_internal_nosimd pffft_transform_internal +void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, + float *scratch, pffft_direction_t direction, int ordered) +{ + const int Ncvec = setup->Ncvec; + const int nf_odd = (setup->ifac[1] & 1); + + assert(scratch != nullptr); + + if(setup->transform == PFFFT_COMPLEX) + ordered = 0; // it is always ordered. + int ib = (nf_odd ^ ordered ? 1 : 0); + float *buff[2] = { output, scratch }; + + if(direction == PFFFT_FORWARD) + { + if(setup->transform == PFFFT_REAL) + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + else + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1); + if(ordered) + { + pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); + ib = !ib; + } + } + else + { + if (input == buff[ib]) + ib = !ib; // may happen when finput == foutput + + if(ordered) + { + pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD); + input = buff[ib]; + ib = !ib; + } + if(setup->transform == PFFFT_REAL) + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + else + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 
0 : 1); + } + if(buff[ib] != output) + { + // extra copy required -- this situation should happens only when finput == foutput + assert(input==output); + for(int k=0; k < Ncvec; ++k) + { + float a = buff[ib][2*k], b = buff[ib][2*k+1]; + output[2*k] = a; output[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == output); +} + +#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab, + float scaling) +{ + int Ncvec = s->Ncvec; + + if(s->transform == PFFFT_REAL) + { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; + ++ab; ++a; ++b; --Ncvec; + } + for(int i=0; i < Ncvec; ++i) + { + float ar = a[2*i+0], ai = a[2*i+1]; + const float br = b[2*i+0], bi = b[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + ab[2*i+0] += ar*scaling; + ab[2*i+1] += ai*scaling; + } +} + +#endif // defined(PFFFT_SIMD_DISABLE) + +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +{ + pffft_transform_internal(setup, input, output, reinterpret_cast(work), direction, 0); +} + +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +{ + pffft_transform_internal(setup, input, output, reinterpret_cast(work), direction, 1); +} -- cgit v1.2.3 From 3ccde151c609ba8fed59f07277ca5c719b2b92fc Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Fri, 6 Oct 2023 01:20:24 -0700 Subject: Fix x86-64 MSVC check --- common/pffft.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 5b3b25e7..d42f7baf 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -124,7 +124,7 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p /* * SSE1 support macros */ -#elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X86) || \ +#elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \ (defined(_M_IX86_FP) && _M_IX86_FP >= 1) #include -- cgit v1.2.3 From 393790de91b7ab81c75f7ebff7874a3c92dc6bbf Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Fri, 6 Oct 2023 21:06:03 -0700 Subject: Add a generic GCC vector extension fallback for pffft Also combine multiple allocations into one. 
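The fallback relies on GCC (and compilers implementing its vector extensions)
synthesizing the element-wise arithmetic and shuffles from the vector_size
attribute alone, with no ISA-specific intrinsics. A minimal standalone sketch
of that mechanism (hypothetical demo, not code from this patch):

    #include <cstdio>

    // 16-byte (4-float) vector type using the GNU vector extension
    using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;

    int main()
    {
        v4sf a{1.0f, 2.0f, 3.0f, 4.0f};
        v4sf b{10.0f, 20.0f, 30.0f, 40.0f};
        v4sf c = a*b + b;                 // element-wise multiply-add
        v4sf lo{a[0], b[0], a[1], b[1]};  // an "unpacklo"-style shuffle
        std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);
        std::printf("%g %g %g %g\n", lo[0], lo[1], lo[2], lo[3]);
        return 0;
    }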
--- common/pffft.cpp | 93 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 13 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index d42f7baf..146afef5 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -170,6 +170,63 @@ typedef float32x4_t v4sf; #define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) #define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0x3) == 0) +/* + * Generic GCC vector macros + */ +#elif defined(__GNUC__) + +using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; +#define SIMD_SZ 4 +#define VZERO() v4sf{0,0,0,0} +#define VMUL(a,b) ((a) * (b)) +#define VADD(a,b) ((a) + (b)) +#define VMADD(a,b,c) ((a)*(b) + (c)) +#define VSUB(a,b) ((a) - (b)) +#define SVMUL(f,v) ((f) * (v)) + +constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } +#define LD_PS1 ld_ps1 + +[[gnu::always_inline]] inline v4sf unpacklo(v4sf a, v4sf b) noexcept +{ return v4sf{a[0], b[0], a[1], b[1]}; } +[[gnu::always_inline]] inline v4sf unpackhi(v4sf a, v4sf b) noexcept +{ return v4sf{a[2], b[2], a[3], b[3]}; } + +[[gnu::always_inline]] inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp__{unpacklo(in1, in2)}; + out2 = unpackhi(in1, in2); + out1 = tmp__; +} +#define INTERLEAVE2 interleave2 + +[[gnu::always_inline]] inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp__{in1[0], in1[2], in2[0], in2[2]}; + out2 = v4sf{in1[1], in1[3], in2[1], in2[3]}; + out1 = tmp__; +} +#define UNINTERLEAVE2 uninterleave2 + +[[gnu::always_inline]] inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + v4sf tmp0 = unpacklo(x0, x1); + v4sf tmp2 = unpacklo(x2, x3); + v4sf tmp1 = unpackhi(x0, x1); + v4sf tmp3 = unpackhi(x2, x3); + x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]}; + x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]}; + x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]}; + x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]}; +} +#define VTRANSPOSE4 vtranspose4 + +[[gnu::always_inline]] inline v4sf vswaphl(v4sf a, v4sf b) noexcept +{ return v4sf{b[0], b[1], a[2], a[3]}; } +#define VSWAPHL vswaphl + +#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) + #else #warning "building with simd disabled !\n"; @@ -192,8 +249,8 @@ typedef float v4sf; #endif // shortcuts for complex multiplcations -#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } while(0) -#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } while(0) +#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) +#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) #ifndef SVMUL // multiply a scalar with a vector #define SVMUL(f,v) VMUL(LD_PS1(f),v) @@ -1272,28 +1329,38 @@ struct PFFFT_Setup { int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) int ifac[15]; pffft_transform_t transform; - v4sf *data; // allocated room for twiddle coefs float *e; // points into 'data' , N/4*3 elements float *twiddle; // points into 'data', N/4 elements + + alignas(MALLOC_V4SF_ALIGNMENT) v4sf data[1]; }; PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { - PFFFT_Setup *s = new PFFFT_Setup{}; + 
assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX); + assert(N > 0); /* unfortunately, the fft size must be a multiple of 16 for complex FFTs * and 32 for real FFTs -- a lot of stuff would need to be rewritten to * handle other cases (or maybe just switch to a scalar fft, I don't know..) */ - if(transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } - if(transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } - //assert((N % 32) == 0); + if(transform == PFFFT_REAL) + assert((N%(2*SIMD_SZ*SIMD_SZ)) == 0); + else + assert((N%(SIMD_SZ*SIMD_SZ)) == 0); + + const unsigned int Ncvec = static_cast(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; + size_t storelen{offsetof(PFFFT_Setup, data[0]) + (2u*Ncvec * sizeof(v4sf))}; + + void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; + if(!store) return nullptr; + + PFFFT_Setup *s = ::new(store) PFFFT_Setup{}; s->N = N; s->transform = transform; /* nb of complex simd vectors */ - s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; - s->data = static_cast(pffft_aligned_malloc(2u*static_cast(s->Ncvec) * sizeof(v4sf))); - s->e = reinterpret_cast(s->data); - s->twiddle = reinterpret_cast(s->data + (2u*static_cast(s->Ncvec)*(SIMD_SZ-1))/SIMD_SZ); + s->Ncvec = static_cast(Ncvec); + s->e = reinterpret_cast(&s->data[0]); + s->twiddle = reinterpret_cast(&s->data[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]); if(transform == PFFFT_REAL) { @@ -1343,8 +1410,8 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) void pffft_destroy_setup(PFFFT_Setup *s) { - pffft_aligned_free(s->data); - delete s; + std::destroy_at(s); + al_free(s); } #if !defined(PFFFT_SIMD_DISABLE) -- cgit v1.2.3 From 5ed78a9f93d5703dd1290c880cd137a0f22ba8f8 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Sat, 7 Oct 2023 02:33:22 -0700 Subject: Update and clarify some comments --- common/pffft.cpp | 8 +-- common/pffft.h | 176 ++++++++++++++++++++++++++++--------------------------- 2 files changed, 93 insertions(+), 91 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 146afef5..0c2dd940 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -79,8 +79,7 @@ #endif -/* - * vector support macros: the rest of the code is independant of +/* Vector support macros: the rest of the code is independent of * SSE/Altivec/NEON -- adding support for other platforms with 4-element * vectors should be limited to these macros */ @@ -248,7 +247,7 @@ typedef float v4sf; #define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0x3) == 0) #endif -// shortcuts for complex multiplcations +// shortcuts for complex multiplications #define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) #define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) #ifndef SVMUL @@ -1866,6 +1865,8 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + const int Ncvec = s->Ncvec; const v4sf *RESTRICT va = reinterpret_cast(a); const v4sf *RESTRICT vb = reinterpret_cast(b); @@ -1892,7 +1893,6 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #ifndef ZCONVOLVE_USING_INLINE_ASM const v4sf vscal = LD_PS1(scaling); #endif - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); float ar1 = 
reinterpret_cast(va)[0].f[0]; float ai1 = reinterpret_cast(va)[1].f[0]; float br1 = reinterpret_cast(vb)[0].f[0]; diff --git a/common/pffft.h b/common/pffft.h index d9dfa808..87d10216 100644 --- a/common/pffft.h +++ b/common/pffft.h @@ -45,34 +45,36 @@ SOFTWARE. */ -/* - PFFFT : a Pretty Fast FFT. - - This is basically an adaptation of the single precision fftpack - (v4) as found on netlib taking advantage of SIMD instruction found - on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). - - For architectures where no SIMD instruction is available, the code - falls back to a scalar version. - - Restrictions: - - - 1D transforms only, with 32-bit single precision. - - - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, - 144, 160, etc are all acceptable lengths). Performance is best for - 128<=N<=8192. - - - all (float*) pointers in the functions below are expected to - have an "simd-compatible" alignment, that is 16 bytes on x86 and - powerpc CPUs. - - You can allocate such buffers with the functions - pffft_aligned_malloc / pffft_aligned_free (or with stuff like - posix_memalign..) - -*/ +/* PFFFT : a Pretty Fast FFT. + * + * This is basically an adaptation of the single precision fftpack (v4) as + * found on netlib taking advantage of SIMD instructions found on CPUs such as + * Intel x86 (SSE1), PowerPC (Altivec), and Arm (NEON). + * + * For architectures where SIMD instructions aren't available, the code falls + * back to a scalar version. + * + * Restrictions: + * + * - 1D transforms only, with 32-bit single precision. + * + * - supports only transforms for inputs of length N of the form + * N=(2^a)*(3^b)*(5^c), given a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, 144, + * 160, etc are all acceptable lengths). Performance is best for 128<=N<=8192. + * + * - all (float*) pointers for the functions below are expected to have a + * "SIMD-compatible" alignment, that is 16 bytes. + * + * You can allocate such buffers with the pffft_aligned_malloc function, and + * deallocate them with pffft_aligned_free (or with stuff like posix_memalign, + * aligned_alloc, etc). + * + * Note that for the z-domain data of real transforms, when in the canonical + * order (as interleaved complex numbers) both 0-frequency and half-frequency + * components, which are real, are assembled in the first entry as + * F(0)+i*F(n/2+1). The original fftpack placed F(n/2+1) at the end of the + * arrays instead. + */ #ifndef PFFFT_H #define PFFFT_H @@ -100,77 +102,77 @@ typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; #endif -/* - prepare for performing transforms of size N -- the returned - PFFFT_Setup structure is read-only so it can safely be shared by - multiple concurrent threads. -*/ +/** + * Prepare for performing transforms of size N -- the returned PFFFT_Setup + * structure is read-only so it can safely be shared by multiple concurrent + * threads. + */ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); -void pffft_destroy_setup(PFFFT_Setup *); -/* - Perform a Fourier transform , The z-domain data is stored in the - most efficient order for transforming it back, or using it for - convolution. If you need to have its content sorted in the - "usual" way, that is as an array of interleaved complex numbers, - either use pffft_transform_ordered , or call pffft_zreorder after - the forward fft, and before the backward fft. - - Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. 
- Typically you will want to scale the backward transform by 1/N. - - The 'work' pointer must point to an area of N (2*N for complex - fft) floats, properly aligned. It cannot be NULL. - - input and output may alias. -*/ +void pffft_destroy_setup(PFFFT_Setup *setup); + +/** + * Perform a Fourier transform. The z-domain data is stored in the most + * efficient order for transforming back or using for convolution, and as + * such, there's no guarantee to the order of the values. If you need to have + * its content sorted in the usual way, that is as an array of interleaved + * complex numbers, either use pffft_transform_ordered, or call pffft_zreorder + * after the forward fft and before the backward fft. + * + * Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. Typically + * you will want to scale the backward transform by 1/N. + * + * The 'work' pointer must point to an area of N (2*N for complex fft) floats, + * properly aligned. It cannot be NULL. + * + * The input and output parameters may alias. + */ void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); -/* - Similar to pffft_transform, but makes sure that the output is - ordered as expected (interleaved complex numbers). This is - similar to calling pffft_transform and then pffft_zreorder. - - input and output may alias. -*/ +/** + * Similar to pffft_transform, but handles the complex values in the usual form + * (interleaved complex numbers). This is similar to calling + * pffft_transform(..., PFFFT_FORWARD) followed by + * pffft_zreorder(..., PFFFT_FORWARD), or + * pffft_zreorder(..., PFFFT_BACKWARD) followed by + * pffft_transform(..., PFFFT_BACKWARD), for the given direction. + * + * The input and output parameters may alias. + */ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); -/* - call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., - PFFFT_FORWARD) if you want to have the frequency components in - the correct "canonical" order, as interleaved complex numbers. - - (for real transforms, both 0-frequency and half frequency - components, which are real, are assembled in the first entry as - F(0)+i*F(n/2+1). Note that the original fftpack did place - F(n/2+1) at the end of the arrays). - - input and output should not alias. -*/ +/** + * Reorder the z-domain data. For PFFFT_FORWARD, it reorders from the internal + * representation to the "canonical" order (as interleaved complex numbers). + * For PFFFT_BACKWARD, it reorders from the canonical order to the internal + * order suitable for pffft_transform(..., PFFFT_BACKWARD) or + * pffft_zconvolve_accumulate. + * + * The input and output parameters should not alias. + */ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); -/* - Perform a multiplication of the frequency components of dft_a and - dft_b and accumulate them into dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab += (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. -*/ +/** + * Perform a multiplication of the z-domain data in dft_a and dft_b and + * accumulate them into dft_ab. 
The arrays should have been obtained with + * pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD) + * and should *not* be in the usual order (otherwise just perform the operation + * yourself as the dft coeffs are stored as interleaved complex numbers). + * + * The operation performed is: dft_ab += (dft_a * dft_b)*scaling + * + * The dft_a, dft_b, and dft_ab parameters may alias. + */ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); -/* - the float buffers must have the correct alignment (16-byte boundary - on intel and powerpc). This function may be used to obtain such - correctly aligned buffers. -*/ +/** + * The float buffers must have the correct alignment (16-byte boundary on intel + * and powerpc). This function may be used to obtain such correctly aligned + * buffers. + */ void *pffft_aligned_malloc(size_t nb_bytes); void pffft_aligned_free(void *); -/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ +/* Return 4 or 1 depending if vectorization was enable when building pffft.cpp. */ int pffft_simd_size(); #ifdef __cplusplus -- cgit v1.2.3 From d59338e95f5617c71402a3a6be5c6f5f4168c501 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Sun, 8 Oct 2023 01:42:24 -0700 Subject: Avoid some type-punning and clean up pffft a bit --- common/pffft.cpp | 309 ++++++++++++++++++++++++++----------------------------- 1 file changed, 148 insertions(+), 161 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 0c2dd940..06ae66ec 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -57,14 +57,18 @@ #include "pffft.h" +#include #include #include #include #include #include +#include +#include "albit.h" #include "almalloc.h" #include "alnumbers.h" +#include "vector.h" #if defined(__GNUC__) #define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) @@ -1328,10 +1332,9 @@ struct PFFFT_Setup { int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) int ifac[15]; pffft_transform_t transform; - float *e; // points into 'data' , N/4*3 elements - float *twiddle; // points into 'data', N/4 elements - alignas(MALLOC_V4SF_ALIGNMENT) v4sf data[1]; + float *twiddle; // N/4 elements + alignas(MALLOC_V4SF_ALIGNMENT) v4sf e[1]; // N/4*3 elements }; PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) @@ -1348,7 +1351,7 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) assert((N%(SIMD_SZ*SIMD_SZ)) == 0); const unsigned int Ncvec = static_cast(transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; - size_t storelen{offsetof(PFFFT_Setup, data[0]) + (2u*Ncvec * sizeof(v4sf))}; + size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; if(!store) return nullptr; @@ -1358,39 +1361,28 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) s->transform = transform; /* nb of complex simd vectors */ s->Ncvec = static_cast(Ncvec); - s->e = reinterpret_cast(&s->data[0]); - s->twiddle = reinterpret_cast(&s->data[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]); + s->twiddle = reinterpret_cast(&s->e[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]); - if(transform == PFFFT_REAL) + if constexpr(SIMD_SZ > 1) { + al::vector e(2u*Ncvec*(SIMD_SZ-1)); for(int k=0; k < s->Ncvec; ++k) { - int i = k/SIMD_SZ; - int j = k%SIMD_SZ; - for(int m=0; m < SIMD_SZ-1; ++m) + size_t i{static_cast(k) / SIMD_SZ}; + size_t j{static_cast(k) % SIMD_SZ}; + for(size_t m{0};m < SIMD_SZ-1;++m) { - const double A = -2.0*al::numbers::pi*(m+1)*k / N; - s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = static_cast(std::cos(A)); - s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = static_cast(std::sin(A)); + const double A = -2.0*al::numbers::pi*static_cast(m+1)*k / N; + e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); + e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); } } - rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + std::memcpy(s->e, e.data(), e.size()*sizeof(float)); } + if(transform == PFFFT_REAL) + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); else - { - for(int k=0; k < s->Ncvec; ++k) - { - int i = k/SIMD_SZ; - int j = k%SIMD_SZ; - for(int m=0; m < SIMD_SZ-1; ++m) - { - const double A = -2.0*al::numbers::pi*(m+1)*k / N; - s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); - s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); - } - } cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); - } /* check that N is decomposable with allowed prime factors */ int m = 1; @@ -1455,13 +1447,14 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { - const int N = setup->N, Ncvec = setup->Ncvec; - const v4sf *vin = reinterpret_cast(in); - v4sf *vout = reinterpret_cast(out); assert(in != out); + + const int N{setup->N}, Ncvec{setup->Ncvec}; + const v4sf *vin{reinterpret_cast(in)}; + v4sf *vout{reinterpret_cast(out)}; if(setup->transform == PFFFT_REAL) { - const int dk = N/32; + const int dk{N/32}; if(direction == PFFFT_FORWARD) { for(int k=0; k < dk; ++k) @@ -1469,8 +1462,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); } - reversed_copy(dk, vin+2, 8, reinterpret_cast(out + N/2)); - reversed_copy(dk, vin+6, 8, reinterpret_cast(out + N)); + reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2); + reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ); } else { @@ -1479,8 +1472,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); } - unreversed_copy(dk, reinterpret_cast(in + N/4), reinterpret_cast(out + N - 6*SIMD_SZ), -8); - unreversed_copy(dk, reinterpret_cast(in + 3*N/4), reinterpret_cast(out + N - 2*SIMD_SZ), -8); + 
unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8); + unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8); } } else @@ -1504,31 +1497,29 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc } } -void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; assert(in != out); + + const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks for(int k=0; k < dk; ++k) { - r0 = in[8*k+0]; i0 = in[8*k+1]; - r1 = in[8*k+2]; i1 = in[8*k+3]; - r2 = in[8*k+4]; i2 = in[8*k+5]; - r3 = in[8*k+6]; i3 = in[8*k+7]; + v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; + v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; + v4sf r2{in[8*k+4]}, i2{in[8*k+5]}; + v4sf r3{in[8*k+6]}, i3{in[8*k+7]}; VTRANSPOSE4(r0,r1,r2,r3); VTRANSPOSE4(i0,i1,i2,i3); VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); - sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); - si0 = VADD(i0,i2); di0 = VSUB(i0, i2); - si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)}; - /* - * transformation for each column is: + /* transformation for each column is: * * [1 1 1 1 0 0 0 0] [r0] * [1 0 -1 0 0 -1 0 1] [r1] @@ -1550,23 +1541,22 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) } } -void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; assert(in != out); + + const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks for(int k=0; k < dk; ++k) { - r0 = in[8*k+0]; i0 = in[8*k+1]; - r1 = in[8*k+2]; i1 = in[8*k+3]; - r2 = in[8*k+4]; i2 = in[8*k+5]; - r3 = in[8*k+6]; i3 = in[8*k+7]; + v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; + v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; + v4sf r2{in[8*k+4]}, i2{in[8*k+5]}; + v4sf r3{in[8*k+6]}, i3{in[8*k+7]}; - sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); - si0 = VADD(i0,i2); di0 = VSUB(i0, i2); - si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)}; r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); @@ -1645,20 +1635,18 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - static constexpr float s = al::numbers::sqrt2_v/2.0f; - const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ - - v4sf_union cr, ci, *uout = reinterpret_cast(out); - v4sf save = in[7], zero=VZERO(); - float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + static constexpr float s{al::numbers::sqrt2_v/2.0f}; - cr.v = in[0]; ci.v = in[Ncvec*2-1]; assert(in != out); + const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + const v4sf zero{VZERO()}; + const auto cr = al::bit_cast>(in[0]); + const auto ci = al::bit_cast>(in[Ncvec*2-1]); pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); - /* - * [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] * * [Xr(1)] ] [1 1 1 1 0 0 0 0] * [Xr(N/4) ] [0 0 0 0 1 s 0 -s] @@ -1670,29 +1658,26 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] */ - xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; - xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; - xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; - xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; - xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; - xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; - xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; - xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; - - for(int k=1; k < dk; ++k) - { - v4sf save_next = in[8*k+7]; - pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); - save = save_next; - } + auto *uout = reinterpret_cast(out); + const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; uout[0].f[0] = xr0; + const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; uout[1].f[0] = xi0; + const float xr2{(cr[0]-cr[2])}; uout[4].f[0] = xr2; + const float xi2{(cr[3]-cr[1])}; uout[5].f[0] = xi2; + const float xr1{ ci[0] + s*(ci[1]-ci[3])}; uout[2].f[0] = xr1; + const float xi1{-ci[2] - s*(ci[1]+ci[3])}; uout[3].f[0] = xi1; + const float xr3{ ci[0] - s*(ci[1]-ci[3])}; uout[6].f[0] = xr3; + const float xi3{ ci[2] - s*(ci[1]+ci[3])}; uout[7].f[0] = xi3; + + for(int k{1};k < dk;++k) + pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); } static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, int first) { v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; - /* - * transformation for each column is: + + /* transformation for each column is: * * [1 1 1 1 0 0 0 0] [r0] * [1 0 0 -1 0 -1 -1 0] [r1] @@ -1741,22 +1726,22 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { static constexpr float s = al::numbers::sqrt2_v; - const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ - v4sf_union Xr, Xi, *uout = reinterpret_cast(out); - float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; assert(in != out); - for(int k=0; k < 4; ++k) + const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + std::array Xr, Xi; + for(size_t k{0};k < 4;++k) { - Xr.f[k] = reinterpret_cast(in)[8*k]; - Xi.f[k] = reinterpret_cast(in)[8*k+4]; + /* TODO: Use _mm_cvtss_f32 or equivalent. 
*/ + Xr[k] = al::bit_cast>(in[4*k])[0]; + Xi[k] = al::bit_cast>(in[4*k + 1])[0]; } pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values - /* - * [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] * * [cr0] [1 0 2 0 1 0 0 0] * [cr1] [1 0 0 0 -1 0 -2 0] @@ -1767,57 +1752,60 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf * [ci2] [0 0 0 0 0 -2 0 2] * [ci3] [0 -s 0 s 0 -s 0 -s] */ - for(int k=1; k < dk; ++k) + for(int k{1};k < dk;++k) pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); - cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; - cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; - cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; - cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; - ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; - ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; - ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; - ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; + /* TODO: Use _mm_set_ps or equivalent. */ + auto *uout = reinterpret_cast(out); + const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; uout[0].f[0] = cr0; + const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; uout[0].f[1] = cr1; + const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; uout[0].f[2] = cr2; + const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; uout[0].f[3] = cr3; + const float ci0{ 2*(Xr[1]+Xr[3])}; uout[2*Ncvec-1].f[0] = ci0; + const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[1] = ci1; + const float ci2{ 2*(Xi[3]-Xi[1])}; uout[2*Ncvec-1].f[2] = ci2; + const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[3] = ci3; } void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, pffft_direction_t direction, int ordered) { - const int Ncvec = setup->Ncvec; - const int nf_odd = (setup->ifac[1] & 1); - - // temporary buffer is allocated on the stack if the scratch pointer is NULL assert(scratch != nullptr); + assert(VALIGNED(finput) && VALIGNED(foutput) && VALIGNED(scratch)); - const v4sf *vinput = reinterpret_cast(finput); - v4sf *voutput = reinterpret_cast(foutput); - v4sf *buff[2] = { voutput, scratch }; - int ib = (nf_odd ^ ordered ? 1 : 0); + const int Ncvec{setup->Ncvec}; + const int nf_odd{setup->ifac[1] & 1}; - assert(VALIGNED(finput) && VALIGNED(foutput)); + auto *vinput = reinterpret_cast(finput); + auto *voutput = reinterpret_cast(foutput); + assert(voutput != scratch); - //assert(finput != foutput); + v4sf *buff[2]{voutput, scratch}; + int ib{(nf_odd ^ ordered) ? 1 : 0}; if(direction == PFFFT_FORWARD) { + /* Swap the initial work buffer for forward FFTs, which helps avoid an + * extra copy for output. + */ ib = !ib; if(setup->transform == PFFFT_REAL) { - ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - pffft_real_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast(setup->e)); + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e); } else { - v4sf *tmp = buff[ib]; + v4sf *tmp{buff[ib]}; for(int k=0; k < Ncvec; ++k) - { UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); - } - ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 
0 : 1); - pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast(setup->e)); + + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e); } if(ordered) - pffft_zreorder(setup, reinterpret_cast(buff[!ib]), reinterpret_cast(buff[ib]), PFFFT_FORWARD); + pffft_zreorder(setup, reinterpret_cast(buff[!ib]), + reinterpret_cast(buff[ib]), PFFFT_FORWARD); else ib = !ib; } @@ -1828,23 +1816,22 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo if(ordered) { - pffft_zreorder(setup, reinterpret_cast(vinput), reinterpret_cast(buff[ib]), PFFFT_BACKWARD); + pffft_zreorder(setup, reinterpret_cast(vinput), + reinterpret_cast(buff[ib]), PFFFT_BACKWARD); vinput = buff[ib]; ib = !ib; } if(setup->transform == PFFFT_REAL) { - pffft_real_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast(setup->e)); - ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]); } else { - pffft_cplx_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast(setup->e)); - ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 0 : 1); - for(int k=0; k < Ncvec; ++k) - { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]); + for(int k{0};k < Ncvec;++k) INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); - } } } @@ -1852,14 +1839,13 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo { /* extra copy required -- this situation should only happen when finput == foutput */ assert(finput==foutput); - for(int k=0; k < Ncvec; ++k) + for(int k{0};k < Ncvec;++k) { - v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; + v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; voutput[2*k] = a; voutput[2*k+1] = b; } ib = !ib; } - assert(buff[ib] == voutput); } void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, @@ -1867,10 +1853,10 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, { assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - const int Ncvec = s->Ncvec; - const v4sf *RESTRICT va = reinterpret_cast(a); - const v4sf *RESTRICT vb = reinterpret_cast(b); - v4sf *RESTRICT vab = reinterpret_cast(ab); + const int Ncvec{s->Ncvec}; + const v4sf *RESTRICT va{reinterpret_cast(a)}; + const v4sf *RESTRICT vb{reinterpret_cast(b)}; + v4sf *RESTRICT vab{reinterpret_cast(ab)}; #ifdef __arm__ __builtin_prefetch(va); @@ -1891,18 +1877,19 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #endif #ifndef ZCONVOLVE_USING_INLINE_ASM - const v4sf vscal = LD_PS1(scaling); + const v4sf vscal{LD_PS1(scaling)}; #endif - float ar1 = reinterpret_cast(va)[0].f[0]; - float ai1 = reinterpret_cast(va)[1].f[0]; - float br1 = reinterpret_cast(vb)[0].f[0]; - float bi1 = reinterpret_cast(vb)[1].f[0]; - float abr1 = reinterpret_cast(vab)[0].f[0]; - float abi1 = reinterpret_cast(vab)[1].f[0]; + /* TODO: Use _mm_cvtss_f32 or equivalent. 
*/ + float ar1{reinterpret_cast(va)[0].f[0]}; + float ai1{reinterpret_cast(va)[1].f[0]}; + float br1{reinterpret_cast(vb)[0].f[0]}; + float bi1{reinterpret_cast(vb)[1].f[0]}; + float abr1{reinterpret_cast(vab)[0].f[0]}; + float abi1{reinterpret_cast(vab)[1].f[0]}; #ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc - const float *a_ = a, *b_ = b; float *ab_ = ab; - int N = Ncvec; + const float *a_{a}, *b_{b}; float *ab_{ab}; + int N{Ncvec}; asm volatile("mov r8, %2 \n" "vdup.f32 q15, %4 \n" "1: \n" @@ -1941,9 +1928,8 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, for(int i=0; i < Ncvec; i += 2) { - v4sf ar4, ai4, br4, bi4; - ar4 = va[2*i+0]; ai4 = va[2*i+1]; - br4 = vb[2*i+0]; bi4 = vb[2*i+1]; + v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; + v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; VCPLXMUL(ar4, ai4, br4, bi4); vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]); vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]); @@ -2000,22 +1986,24 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, pffft_direction_t direction, int ordered) { - const int Ncvec = setup->Ncvec; - const int nf_odd = (setup->ifac[1] & 1); + const int Ncvec{setup->Ncvec}; + const int nf_odd{setup->ifac[1] & 1}; assert(scratch != nullptr); + /* z-domain data for complex transforms is already ordered without SIMD. */ if(setup->transform == PFFFT_COMPLEX) - ordered = 0; // it is always ordered. - int ib = (nf_odd ^ ordered ? 1 : 0); - float *buff[2] = { output, scratch }; + ordered = 0; + + float *buff[2]{output, scratch}; + int ib{(nf_odd ^ ordered) ? 1 : 0}; if(direction == PFFFT_FORWARD) { if(setup->transform == PFFFT_REAL) - ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]); else - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1); + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]); if(ordered) { pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); @@ -2034,9 +2022,9 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo ib = !ib; } if(setup->transform == PFFFT_REAL) - ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]); else - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 
0 : 1); + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]); } if(buff[ib] != output) { @@ -2049,7 +2037,6 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo } ib = !ib; } - assert(buff[ib] == output); } #define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate -- cgit v1.2.3 From 63b8c6b9c025bac9acae8d74783dd8a45b2c06ad Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Sun, 8 Oct 2023 03:22:24 -0700 Subject: Clean up some more type-punning in pffft --- common/pffft.cpp | 64 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 06ae66ec..261e51db 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -103,8 +103,16 @@ typedef vector float v4sf; #define VADD(a,b) vec_add(a,b) #define VMADD(a,b,c) vec_madd(a,b,c) #define VSUB(a,b) vec_sub(a,b) -inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } -#define LD_PS1(p) ld_ps1(&p) +#define LD_PS1(p) vec_splats(p) +inline v4sf vset4(float a, float b, float c, float d) +{ + /* There a more efficient way to do this? */ + alignas(16) std::array vals{{a, b, c, d}}; + return vec_ld(0, vals.data()); +} +#define VSET4 vset4 +#define VEXTRACT0(v) vec_extract((v), 0) +/* vec_insert(v, value, idx), v[idx] = value */ #define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { \ vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ @@ -139,6 +147,8 @@ typedef __m128 v4sf; #define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) #define VSUB(a,b) _mm_sub_ps(a,b) #define LD_PS1(p) _mm_set1_ps(p) +#define VSET4 _mm_setr_ps +#define VEXTRACT0 _mm_cvtss_f32 #define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0) #define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) @@ -159,6 +169,16 @@ typedef float32x4_t v4sf; #define VMADD(a,b,c) vmlaq_f32(c,a,b) #define VSUB(a,b) vsubq_f32(a,b) #define LD_PS1(p) vld1q_dup_f32(&(p)) +inline v4sf vset4(float a, float b, float c, float d) +{ + float32x4_t ret{vmovq_n_f32(a)}; + ret = vsetq_lane_f32(b, ret, 1); + ret = vsetq_lane_f32(c, ret, 2); + ret = vsetq_lane_f32(d, ret, 3); + return ret; +} +#define VSET4 vset4 +#define VEXTRACT0(v) vgetq_lane_f32((v), 0) #define INTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) #define VTRANSPOSE4(x0,x1,x2,x3) do { \ @@ -189,6 +209,8 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 +#define VSET4(a, b, c, d) v4sf{(a), (b), (c), (d)} +#define VEXTRACT0(v) ((v)[0]) [[gnu::always_inline]] inline v4sf unpacklo(v4sf a, v4sf b) noexcept { return v4sf{a[0], b[0], a[1], b[1]}; } @@ -1734,9 +1756,8 @@ static NEVER_INLINE(void) 
pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf std::array Xr, Xi; for(size_t k{0};k < 4;++k) { - /* TODO: Use _mm_cvtss_f32 or equivalent. */ - Xr[k] = al::bit_cast>(in[4*k])[0]; - Xi[k] = al::bit_cast>(in[4*k + 1])[0]; + Xr[k] = VEXTRACT0(in[4*k]); + Xi[k] = VEXTRACT0(in[4*k + 1]); } pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values @@ -1755,16 +1776,16 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf for(int k{1};k < dk;++k) pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); - /* TODO: Use _mm_set_ps or equivalent. */ - auto *uout = reinterpret_cast(out); - const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; uout[0].f[0] = cr0; - const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; uout[0].f[1] = cr1; - const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; uout[0].f[2] = cr2; - const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; uout[0].f[3] = cr3; - const float ci0{ 2*(Xr[1]+Xr[3])}; uout[2*Ncvec-1].f[0] = ci0; - const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[1] = ci1; - const float ci2{ 2*(Xi[3]-Xi[1])}; uout[2*Ncvec-1].f[2] = ci2; - const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[3] = ci3; + const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; + const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; + const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; + const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; + out[0] = VSET4(cr0, cr1, cr2, cr3); + const float ci0{ 2*(Xr[1]+Xr[3])}; + const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; + const float ci2{ 2*(Xi[3]-Xi[1])}; + const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; + out[2*Ncvec-1] = VSET4(ci0, ci1, ci2, ci3); } @@ -1879,13 +1900,12 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #ifndef ZCONVOLVE_USING_INLINE_ASM const v4sf vscal{LD_PS1(scaling)}; #endif - /* TODO: Use _mm_cvtss_f32 or equivalent. */ - float ar1{reinterpret_cast(va)[0].f[0]}; - float ai1{reinterpret_cast(va)[1].f[0]}; - float br1{reinterpret_cast(vb)[0].f[0]}; - float bi1{reinterpret_cast(vb)[1].f[0]}; - float abr1{reinterpret_cast(vab)[0].f[0]}; - float abi1{reinterpret_cast(vab)[1].f[0]}; + float ar1{VEXTRACT0(va[0])}; + float ai1{VEXTRACT0(va[1])}; + float br1{VEXTRACT0(vb[0])}; + float bi1{VEXTRACT0(vb[1])}; + float abr1{VEXTRACT0(vab[0])}; + float abi1{VEXTRACT0(vab[1])}; #ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. 
so this will be restricted to gcc const float *a_{a}, *b_{b}; float *ab_{ab}; -- cgit v1.2.3 From e805ab9258bea2162bab71e71a2eeddb2be28eeb Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Sun, 8 Oct 2023 20:03:07 -0700 Subject: Remove more type-punning from pffft --- common/pffft.cpp | 115 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 63 insertions(+), 52 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 261e51db..883e44f0 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include @@ -80,6 +80,12 @@ #define ALWAYS_INLINE(return_type) __forceinline return_type #define NEVER_INLINE(return_type) __declspec(noinline) return_type #define RESTRICT __restrict + +#else + +#define ALWAYS_INLINE(return_type) inline return_type +#define NEVER_INLINE(return_type) return_type +#define RESTRICT #endif @@ -111,8 +117,8 @@ inline v4sf vset4(float a, float b, float c, float d) return vec_ld(0, vals.data()); } #define VSET4 vset4 +#define VINSERT0(v, a) vec_insert((a), (v), 0) #define VEXTRACT0(v) vec_extract((v), 0) -/* vec_insert(v, value, idx), v[idx] = value */ #define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { \ vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ @@ -148,6 +154,7 @@ typedef __m128 v4sf; #define VSUB(a,b) _mm_sub_ps(a,b) #define LD_PS1(p) _mm_set1_ps(p) #define VSET4 _mm_setr_ps +#define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) #define VEXTRACT0 _mm_cvtss_f32 #define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0) @@ -178,6 +185,7 @@ inline v4sf vset4(float a, float b, float c, float d) return ret; } #define VSET4 vset4 +#define VINSERT0(v, a) vsetq_lane_f32((a), (v), 0) #define VEXTRACT0(v) vgetq_lane_f32((v), 0) #define INTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) @@ -210,6 +218,9 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 #define VSET4(a, b, c, d) v4sf{(a), (b), (c), (d)} +[[gnu::always_inline]] inline v4sf vinsert0(v4sf v, float a) noexcept +{ return v4sf{a, v[1], v[2], v[3]}; } +#define VINSERT0 vinsert0 #define VEXTRACT0(v) ((v)[0]) [[gnu::always_inline]] inline v4sf unpacklo(v4sf a, v4sf b) noexcept @@ -235,10 +246,10 @@ constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } [[gnu::always_inline]] inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { - v4sf tmp0 = unpacklo(x0, x1); - v4sf tmp2 = unpacklo(x2, x3); - v4sf tmp1 = unpackhi(x0, x1); - v4sf tmp3 = unpackhi(x2, x3); + v4sf tmp0{unpacklo(x0, x1)}; + v4sf tmp2{unpacklo(x2, x3)}; + v4sf tmp1{unpackhi(x0, x1)}; + v4sf tmp3{unpackhi(x2, x3)}; x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]}; x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]}; x2 
= v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]}; @@ -282,52 +293,53 @@ typedef float v4sf; #endif #if !defined(PFFFT_SIMD_DISABLE) -/* TODO: Remove this, type-punning to access individual SIMD values is bad. */ -typedef union v4sf_union { - v4sf v; - float f[4]; -} v4sf_union; - -#include -#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) +#define assertv4(v,f0,f1,f2,f3) assert(v##_f[0] == (f0) && v##_f[1] == (f1) && v##_f[2] == (f2) && v##_f[3] == (f3)) /* detect bugs with the vector support macros */ void validate_pffft_simd() { - float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; - v4sf_union a0, a1, a2, a3, t, u; - memcpy(a0.f, f, 4*sizeof(float)); - memcpy(a1.f, f+4, 4*sizeof(float)); - memcpy(a2.f, f+8, 4*sizeof(float)); - memcpy(a3.f, f+12, 4*sizeof(float)); - - t = a0; u = a1; t.v = VZERO(); - printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); - t.v = VADD(a1.v, a2.v); - printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); - t.v = VMUL(a1.v, a2.v); - printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); - t.v = VMADD(a1.v, a2.v,a0.v); - printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); - - INTERLEAVE2(a1.v,a2.v,t.v,u.v); - printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + using float4 = std::array; + static constexpr float f[16]{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + + float4 a0_f, a1_f, a2_f, a3_f, t_f, u_f; + v4sf a0_v, a1_v, a2_v, a3_v, t_v, u_v; + std::memcpy(&a0_v, f, 4*sizeof(float)); + std::memcpy(&a1_v, f+4, 4*sizeof(float)); + std::memcpy(&a2_v, f+8, 4*sizeof(float)); + std::memcpy(&a3_v, f+12, 4*sizeof(float)); + + t_v = a0_v; u_v = a1_v; t_v = VZERO(); + t_f = al::bit_cast(t_v); + printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 0, 0, 0, 0); + t_v = VADD(a1_v, a2_v); t_f = al::bit_cast(t_v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 12, 14, 16, 18); + t_v = VMUL(a1_v, a2_v); t_f = al::bit_cast(t_v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 45, 60, 77); + t_v = VMADD(a1_v, a2_v,a0_v); t_f = al::bit_cast(t_v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast(t_v); u_f = al::bit_cast(u_v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]); assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); - UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); - printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + UNINTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast(t_v); u_f = al::bit_cast(u_v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]); assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); - t.v=LD_PS1(f[15]); - printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + t_v=LD_PS1(f[15]); t_f = al::bit_cast(t_v); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 15, 15, 15, 15); - t.v 
= VSWAPHL(a1.v, a2.v); - printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + t_v = VSWAPHL(a1_v, a2_v); t_f = al::bit_cast(t_v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 8, 9, 6, 7); - VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + VTRANSPOSE4(a0_v, a1_v, a2_v, a3_v); + a0_f = al::bit_cast(a0_v); + a1_f = al::bit_cast(a1_v); + a2_f = al::bit_cast(a2_v); + a3_f = al::bit_cast(a3_v); printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", - a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], - a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3], + a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]); assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); } #endif //!PFFFT_SIMD_DISABLE @@ -1680,15 +1692,14 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] */ - auto *uout = reinterpret_cast(out); - const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; uout[0].f[0] = xr0; - const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; uout[1].f[0] = xi0; - const float xr2{(cr[0]-cr[2])}; uout[4].f[0] = xr2; - const float xi2{(cr[3]-cr[1])}; uout[5].f[0] = xi2; - const float xr1{ ci[0] + s*(ci[1]-ci[3])}; uout[2].f[0] = xr1; - const float xi1{-ci[2] - s*(ci[1]+ci[3])}; uout[3].f[0] = xi1; - const float xr3{ ci[0] - s*(ci[1]-ci[3])}; uout[6].f[0] = xr3; - const float xi3{ ci[2] - s*(ci[1]+ci[3])}; uout[7].f[0] = xi3; + const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = VINSERT0(out[0], xr0); + const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = VINSERT0(out[1], xi0); + const float xr2{(cr[0]-cr[2])}; out[4] = VINSERT0(out[4], xr2); + const float xi2{(cr[3]-cr[1])}; out[5] = VINSERT0(out[5], xi2); + const float xr1{ ci[0] + s*(ci[1]-ci[3])}; out[2] = VINSERT0(out[2], xr1); + const float xi1{-ci[2] - s*(ci[1]+ci[3])}; out[3] = VINSERT0(out[3], xi1); + const float xr3{ ci[0] - s*(ci[1]-ci[3])}; out[6] = VINSERT0(out[6], xr3); + const float xi3{ ci[2] - s*(ci[1]+ci[3])}; out[7] = VINSERT0(out[7], xi3); for(int k{1};k < dk;++k) pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); @@ -1963,8 +1974,8 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, if(s->transform == PFFFT_REAL) { - reinterpret_cast(vab)[0].f[0] = abr1 + ar1*br1*scaling; - reinterpret_cast(vab)[1].f[0] = abi1 + ai1*bi1*scaling; + vab[0] = VINSERT0(vab[0], abr1 + ar1*br1*scaling); + vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1*scaling); } } -- cgit v1.2.3 From 60ed9ec8bad22cc904ff0dec9b6d7dfe3c704e56 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 9 Oct 2023 01:29:14 -0700 Subject: Cleanup PFFFT Make stylization more consistent. Remove SVMUL (they all simulated it with a LD_PS1 on the scalar). Avoid calling LD_PS1 on the same value in a loop. 
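
As a rough sketch of what the LD_PS1 hoisting amounts to (illustrative only: it
uses SSE intrinsics directly rather than the file's LD_PS1/VMUL macros, and the
function names are made up, not taken from pffft):

    #include <xmmintrin.h>

    /* Before: the scalar factor is re-broadcast on every use, which is what
     * SVMUL(f, v) expanded to via LD_PS1(f). */
    void scale_per_use(const __m128 *in, __m128 *out, const int n)
    {
        for(int i = 0;i < n;++i)
            out[i] = _mm_mul_ps(_mm_set1_ps(0.866025403784439f), in[i]);
    }

    /* After: broadcast the constant once ahead of the loop and reuse the
     * vector value inside it. */
    void scale_hoisted(const __m128 *in, __m128 *out, const int n)
    {
        const __m128 vtaui = _mm_set1_ps(0.866025403784439f);
        for(int i = 0;i < n;++i)
            out[i] = _mm_mul_ps(vtaui, in[i]);
    }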
--- common/pffft.cpp | 1175 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 604 insertions(+), 571 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 883e44f0..8eb5a19b 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -68,6 +68,7 @@ #include "albit.h" #include "almalloc.h" #include "alnumbers.h" +#include "opthelpers.h" #include "vector.h" #if defined(__GNUC__) @@ -94,7 +95,7 @@ * vectors should be limited to these macros */ -// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +/* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */ //#define PFFFT_SIMD_DISABLE #ifndef PFFFT_SIMD_DISABLE @@ -147,18 +148,18 @@ inline v4sf vset4(float a, float b, float c, float d) #include typedef __m128 v4sf; #define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. -#define VZERO() _mm_setzero_ps() -#define VMUL(a,b) _mm_mul_ps(a,b) -#define VADD(a,b) _mm_add_ps(a,b) +#define VZERO _mm_setzero_ps +#define VMUL _mm_mul_ps +#define VADD _mm_add_ps #define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) -#define VSUB(a,b) _mm_sub_ps(a,b) -#define LD_PS1(p) _mm_set1_ps(p) +#define VSUB _mm_sub_ps +#define LD_PS1 _mm_set1_ps #define VSET4 _mm_setr_ps #define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) #define VEXTRACT0 _mm_cvtss_f32 #define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0) -#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +#define VTRANSPOSE4 _MM_TRANSPOSE4_PS #define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) #define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) @@ -171,11 +172,11 @@ typedef __m128 v4sf; typedef float32x4_t v4sf; #define SIMD_SZ 4 #define VZERO() vdupq_n_f32(0) -#define VMUL(a,b) vmulq_f32(a,b) -#define VADD(a,b) vaddq_f32(a,b) +#define VMUL vmulq_f32 +#define VADD vaddq_f32 #define VMADD(a,b,c) vmlaq_f32(c,a,b) -#define VSUB(a,b) vsubq_f32(a,b) -#define LD_PS1(p) vld1q_dup_f32(&(p)) +#define VSUB vsubq_f32 +#define LD_PS1 vdupq_n_f32 inline v4sf vset4(float a, float b, float c, float d) { float32x4_t ret{vmovq_n_f32(a)}; @@ -213,7 +214,6 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; #define VADD(a,b) ((a) + (b)) #define VMADD(a,b,c) ((a)*(b) + (c)) #define VSUB(a,b) ((a) - (b)) -#define SVMUL(f,v) ((f) * (v)) constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 @@ -287,10 +287,6 @@ typedef float v4sf; // shortcuts for complex multiplications #define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) #define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) -#ifndef SVMUL -// multiply a scalar with a vector -#define SVMUL(f,v) VMUL(LD_PS1(f),v) -#endif #if !defined(PFFFT_SIMD_DISABLE) @@ -309,8 +305,8 @@ void validate_pffft_simd() std::memcpy(&a2_v, f+8, 4*sizeof(float)); std::memcpy(&a3_v, f+12, 4*sizeof(float)); - t_v = a0_v; u_v = a1_v; t_v = VZERO(); - t_f = al::bit_cast(t_v); + t_v = a0_v; u_v = 
a1_v; + t_v = VZERO(); t_f = al::bit_cast(t_v); printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 0, 0, 0, 0); t_v = VADD(a1_v, a2_v); t_f = al::bit_cast(t_v); printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 12, 14, 16, 18); @@ -357,12 +353,13 @@ int pffft_simd_size() { return SIMD_SZ; } /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 */ -static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) +static NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, + const float *wa1, const float fsign) { - const int l1ido = l1*ido; + const int l1ido{l1*ido}; if(ido <= 2) { - for(int k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) + for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { ch[0] = VADD(cc[0], cc[ido+0]); ch[l1ido] = VSUB(cc[0], cc[ido+0]); @@ -372,13 +369,14 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c } else { - for(int k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) + const v4sf vsign{LD_PS1(fsign)}; + for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { - for(int i=0; i 2); - for(int k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) + + const v4sf vtaur{LD_PS1(-0.5f)}; + const v4sf vtaui{LD_PS1(0.866025403784439f*fsign)}; + const int l1ido{l1*ido}; + for(int k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido) { - for(int i=0; i 2); - for(int k = 0; k < l1; ++k, cc += 5*ido, ch += ido) + for(int k{0};k < l1;++k, cc += 5*ido, ch += ido) { - for(int i = 0; i < ido-1; i += 2) + for(int i{0};i < ido-1;i += 2) { - v4sf ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); - v4sf ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); - v4sf ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); - v4sf ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); - v4sf tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); - v4sf tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); - v4sf tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); - v4sf tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + v4sf ti5{VSUB(cc_ref(i , 2), cc_ref(i , 5))}; + v4sf ti2{VADD(cc_ref(i , 2), cc_ref(i , 5))}; + v4sf ti4{VSUB(cc_ref(i , 3), cc_ref(i , 4))}; + v4sf ti3{VADD(cc_ref(i , 3), cc_ref(i , 4))}; + v4sf tr5{VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5))}; + v4sf tr2{VADD(cc_ref(i-1, 2), cc_ref(i-1, 5))}; + v4sf tr4{VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4))}; + v4sf tr3{VADD(cc_ref(i-1, 3), cc_ref(i-1, 4))}; ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); - v4sf cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); - v4sf ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); - v4sf cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); - v4sf ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); - v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); - v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); - v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); - v4sf dr3 = VSUB(cr3, ci4); - v4sf dr4 = VADD(cr3, ci4); - v4sf di3 = VADD(ci3, cr4); - v4sf di4 = VSUB(ci3, cr4); - v4sf dr5 = VADD(cr2, ci5); - v4sf dr2 = VSUB(cr2, ci5); - v4sf di5 = VSUB(ci2, cr5); - v4sf di2 = VADD(ci2, cr5); - float wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; - float wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + v4sf cr2{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr11, tr2),VMUL(vtr12, tr3)))}; + v4sf 
ci2{VADD(cc_ref(i , 1), VADD(VMUL(vtr11, ti2),VMUL(vtr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr12, tr2),VMUL(vtr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1), VADD(VMUL(vtr12, ti2),VMUL(vtr11, ti3)))}; + v4sf cr5{VADD(VMUL(vti11, tr5), VMUL(vti12, tr4))}; + v4sf ci5{VADD(VMUL(vti11, ti5), VMUL(vti12, ti4))}; + v4sf cr4{VSUB(VMUL(vti12, tr5), VMUL(vti11, tr4))}; + v4sf ci4{VSUB(VMUL(vti12, ti5), VMUL(vti11, ti4))}; + v4sf dr3{VSUB(cr3, ci4)}; + v4sf dr4{VADD(cr3, ci4)}; + v4sf di3{VADD(ci3, cr4)}; + v4sf di4{VSUB(ci3, cr4)}; + v4sf dr5{VADD(cr2, ci5)}; + v4sf dr2{VSUB(cr2, ci5)}; + v4sf di5{VSUB(ci2, cr5)}; + v4sf di2{VADD(ci2, cr5)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; + float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]}; VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch_ref(i - 1, 2) = dr2; ch_ref(i, 2) = di2; @@ -562,15 +562,13 @@ static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c #undef cc_ref } -static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, - const float *wa1) +static NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc, + v4sf *RESTRICT ch, const float *wa1) { - static constexpr float minus_one = -1.f; - const int l1ido = l1*ido; - - for(int k=0; k < l1ido; k += ido) + const int l1ido{l1*ido}; + for(int k{0};k < l1ido;k += ido) { - v4sf a = cc[k], b = cc[k + l1ido]; + v4sf a{cc[k]}, b{cc[k + l1ido]}; ch[2*k] = VADD(a, b); ch[2*(k+ido)-1] = VSUB(a, b); } @@ -578,12 +576,12 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s return; if(ido != 2) { - for(int k=0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - for(int i=2; i * -0.5f; - const int l1ido = l1*ido; + const int l1ido{l1*ido}; { const v4sf *RESTRICT cc_ = cc, *RESTRICT cc_end = cc + l1ido; v4sf *RESTRICT ch_ = ch; while(cc != cc_end) { // this loop represents between 25% and 40% of total radf4_ps cost ! 
- v4sf a0 = cc[0], a1 = cc[l1ido]; - v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido]; - v4sf tr1 = VADD(a1, a3); - v4sf tr2 = VADD(a0, a2); + v4sf a0{cc[0]}, a1{cc[l1ido]}; + v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]}; + v4sf tr1{VADD(a1, a3)}; + v4sf tr2{VADD(a0, a2)}; ch[2*ido-1] = VSUB(a0, a2); ch[2*ido ] = VSUB(a3, a1); ch[0 ] = VADD(tr1, tr2); @@ -768,47 +770,45 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s return; if(ido != 2) { - for(int k = 0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - const v4sf *RESTRICT pc = cc + 1 + k; - for(int i=2; i * -0.5f)}; + for(int k{0};k < l1ido;k += ido) { - v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido]; - v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido]; - v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b)); - v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a)); + v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]}; + v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]}; + v4sf ti1{VMUL(minus_hsqt2, VADD(a, b))}; + v4sf tr1{VMUL(minus_hsqt2, VSUB(b, a))}; ch[ido-1 + 4*k] = VADD(tr1, c); ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1); ch[4*k + 1*ido] = VSUB(ti1, d); @@ -832,23 +833,23 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s } /* radf4 */ -static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf *RESTRICT ch, - const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) +static NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT cc, + v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, + const float *RESTRICT wa3) { - static constexpr float minus_sqrt2 = -1.414213562373095f; - static constexpr float two = 2.f; - const int l1ido = l1*ido; + const v4sf two{LD_PS1(2.0f)}; + const int l1ido{l1*ido}; { - const v4sf *RESTRICT cc_ = cc, *RESTRICT ch_end = ch + l1ido; - v4sf *ch_ = ch; + const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido}; + v4sf *ch_{ch}; while(ch != ch_end) { - v4sf a = cc[0], b = cc[4*ido-1]; - v4sf c = cc[2*ido], d = cc[2*ido-1]; - v4sf tr3 = SVMUL(two,d); - v4sf tr2 = VADD(a,b); - v4sf tr1 = VSUB(a,b); - v4sf tr4 = SVMUL(two,c); + v4sf a{cc[0]}, b{cc[4*ido-1]}; + v4sf c{cc[2*ido]}, d{cc[2*ido-1]}; + v4sf tr3{VMUL(two,d)}; + v4sf tr2{VADD(a,b)}; + v4sf tr1{VSUB(a,b)}; + v4sf tr4{VMUL(two,c)}; ch[0*l1ido] = VADD(tr2, tr3); ch[2*l1ido] = VSUB(tr2, tr3); ch[1*l1ido] = VSUB(tr1, tr4); @@ -862,31 +863,31 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 return; if(ido != 2) { - for(int k = 0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - const v4sf *RESTRICT pc = cc - 1 + 4*k; - v4sf *RESTRICT ph = ch + k + 1; - for(int i = 2; i < ido; i += 2) + const v4sf *RESTRICT pc{cc - 1 + 4*k}; + v4sf *RESTRICT ph{ch + k + 1}; + for(int i{2};i < ido;i += 2) { - v4sf tr1 = VSUB(pc[i], pc[4*ido - i]); - v4sf tr2 = VADD(pc[i], pc[4*ido - i]); - v4sf ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]); - v4sf tr3 = VADD(pc[2*ido + i], pc[2*ido - i]); + v4sf tr1{VSUB(pc[i], pc[4*ido - i])}; + v4sf tr2{VADD(pc[i], pc[4*ido - i])}; + v4sf ti4{VSUB(pc[2*ido + i], pc[2*ido - i])}; + v4sf tr3{VADD(pc[2*ido + i], pc[2*ido - i])}; ph[0] = VADD(tr2, tr3); - v4sf cr3 = VSUB(tr2, tr3); + v4sf cr3{VSUB(tr2, tr3)}; - v4sf ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]); - v4sf tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]); - v4sf cr2 = VSUB(tr1, tr4); - v4sf cr4 = VADD(tr1, tr4); + v4sf ti3{VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1])}; + v4sf 
tr4{VADD(pc[2*ido + i + 1], pc[2*ido - i + 1])}; + v4sf cr2{VSUB(tr1, tr4)}; + v4sf cr4{VADD(tr1, tr4)}; - v4sf ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]); - v4sf ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]); + v4sf ti1{VADD(pc[i + 1], pc[4*ido - i + 1])}; + v4sf ti2{VSUB(pc[i + 1], pc[4*ido - i + 1])}; ph[1] = VADD(ti2, ti3); ph += l1ido; - v4sf ci3 = VSUB(ti2, ti3); - v4sf ci2 = VADD(ti1, ti4); - v4sf ci4 = VSUB(ti1, ti4); + v4sf ci3{VSUB(ti2, ti3)}; + v4sf ci2{VADD(ti1, ti4)}; + v4sf ci4{VSUB(ti1, ti4)}; VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); ph[0] = cr2; ph[1] = ci2; ph += l1ido; @@ -901,92 +902,93 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 if((ido&1) == 1) return; } - for(int k=0; k < l1ido; k+=ido) + const v4sf minus_sqrt2{LD_PS1(-1.414213562373095f)}; + for(int k{0};k < l1ido;k += ido) { - const int i0 = 4*k + ido; - v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1]; - v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0]; - v4sf tr1 = VSUB(c,d); - v4sf tr2 = VADD(c,d); - v4sf ti1 = VADD(b,a); - v4sf ti2 = VSUB(b,a); + const int i0{4*k + ido}; + v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]}; + v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]}; + v4sf tr1{VSUB(c,d)}; + v4sf tr2{VADD(c,d)}; + v4sf ti1{VADD(b,a)}; + v4sf ti2{VSUB(b,a)}; ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2); - ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1)); + ch[ido-1 + k + 1*l1ido] = VMUL(minus_sqrt2, VSUB(ti1, tr1)); ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2); - ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1)); + ch[ido-1 + k + 3*l1ido] = VMUL(minus_sqrt2, VADD(ti1, tr1)); } } /* radb4 */ -static void radf5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, - const float *wa2, const float *wa3, const float *wa4) +static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4) { - static constexpr float tr11 = 0.309016994374947f; - static constexpr float ti11 = 0.951056516295154f; - static constexpr float tr12 = -0.809016994374947f; - static constexpr float ti12 = 0.587785252292473f; + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti12{LD_PS1(0.587785252292473f)}; #define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] /* Parameter adjustments */ - const int ch_offset = 1 + ido * 6; + const int ch_offset{1 + ido * 6}; ch -= ch_offset; - const int cc_offset = 1 + ido * (1 + l1); + const int cc_offset{1 + ido * (1 + l1)}; cc -= cc_offset; /* Function Body */ - for(int k = 1; k <= l1; ++k) + for(int k{1};k <= l1;++k) { - v4sf cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2)); - v4sf ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2)); - v4sf cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3)); - v4sf ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3)); + v4sf cr2{VADD(cc_ref(1, k, 5), cc_ref(1, k, 2))}; + v4sf ci5{VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2))}; + v4sf cr3{VADD(cc_ref(1, k, 4), cc_ref(1, k, 3))}; + v4sf ci4{VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3))}; ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); - ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); - ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); - ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); - ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); + 
ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3))); + ch_ref(1, 3, k) = VADD(VMUL(ti11, ci5), VMUL(ti12, ci4)); + ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3))); + ch_ref(1, 5, k) = VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4)); //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); } if(ido == 1) return; - const int idp2 = ido + 2; - for(int k = 1; k <= l1; ++k) + const int idp2{ido + 2}; + for(int k{1};k <= l1;++k) { - for(int i = 3; i <= ido; i += 2) + for(int i{3};i <= ido;i += 2) { - const int ic = idp2 - i; - v4sf dr2 = LD_PS1(wa1[i-3]); - v4sf di2 = LD_PS1(wa1[i-2]); - v4sf dr3 = LD_PS1(wa2[i-3]); - v4sf di3 = LD_PS1(wa2[i-2]); - v4sf dr4 = LD_PS1(wa3[i-3]); - v4sf di4 = LD_PS1(wa3[i-2]); - v4sf dr5 = LD_PS1(wa4[i-3]); - v4sf di5 = LD_PS1(wa4[i-2]); + const int ic{idp2 - i}; + v4sf dr2{LD_PS1(wa1[i-3])}; + v4sf di2{LD_PS1(wa1[i-2])}; + v4sf dr3{LD_PS1(wa2[i-3])}; + v4sf di3{LD_PS1(wa2[i-2])}; + v4sf dr4{LD_PS1(wa3[i-3])}; + v4sf di4{LD_PS1(wa3[i-2])}; + v4sf dr5{LD_PS1(wa4[i-3])}; + v4sf di5{LD_PS1(wa4[i-2])}; VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); - v4sf cr2 = VADD(dr2, dr5); - v4sf ci5 = VSUB(dr5, dr2); - v4sf cr5 = VSUB(di2, di5); - v4sf ci2 = VADD(di2, di5); - v4sf cr3 = VADD(dr3, dr4); - v4sf ci4 = VSUB(dr4, dr3); - v4sf cr4 = VSUB(di3, di4); - v4sf ci3 = VADD(di3, di4); + v4sf cr2{VADD(dr2, dr5)}; + v4sf ci5{VSUB(dr5, dr2)}; + v4sf cr5{VSUB(di2, di5)}; + v4sf ci2{VADD(di2, di5)}; + v4sf cr3{VADD(dr3, dr4)}; + v4sf ci4{VSUB(dr4, dr3)}; + v4sf cr4{VSUB(di3, di4)}; + v4sf ci3{VADD(di3, di4)}; ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); - ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));// - v4sf tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); - v4sf ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));// - v4sf tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); - v4sf ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));// - v4sf tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4)); - v4sf ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); - v4sf tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4)); - v4sf ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); + ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3)); + v4sf tr2{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3)))}; + v4sf ti2{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr11, ci2), VMUL(tr12, ci3)))}; + v4sf tr3{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3)))}; + v4sf ti3{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr12, ci2), VMUL(tr11, ci3)))}; + v4sf tr5{VADD(VMUL(ti11, cr5), VMUL(ti12, cr4))}; + v4sf ti5{VADD(VMUL(ti11, ci5), VMUL(ti12, ci4))}; + v4sf tr4{VSUB(VMUL(ti12, cr5), VMUL(ti11, cr4))}; + v4sf ti4{VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4))}; ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); ch_ref(i, 3, k) = VADD(ti2, ti5); @@ -1001,35 +1003,35 @@ static void radf5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch #undef ch_ref } /* radf5 */ -static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, - const float *wa2, const float *wa3, const float *wa4) +static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + 
const float *wa1, const float *wa2, const float *wa3, const float *wa4) { - static constexpr float tr11 = 0.309016994374947f; - static constexpr float ti11 = 0.951056516295154f; - static constexpr float tr12 = -0.809016994374947f; - static constexpr float ti12 = 0.587785252292473f; + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti12{LD_PS1(0.587785252292473f)}; #define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] /* Parameter adjustments */ - const int ch_offset = 1 + ido * (1 + l1); + const int ch_offset{1 + ido*(1 + l1)}; ch -= ch_offset; - const int cc_offset = 1 + ido * 6; + const int cc_offset{1 + ido*6}; cc -= cc_offset; /* Function Body */ - for(int k = 1; k <= l1; ++k) + for(int k{1};k <= l1;++k) { - v4sf ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k)); - v4sf ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k)); - v4sf tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k)); - v4sf tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k)); + v4sf ti5{VADD(cc_ref(1, 3, k), cc_ref(1, 3, k))}; + v4sf ti4{VADD(cc_ref(1, 5, k), cc_ref(1, 5, k))}; + v4sf tr2{VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k))}; + v4sf tr3{VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k))}; ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); - v4sf cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); - v4sf cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); - v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + v4sf cr2{VADD(cc_ref(1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))}; + v4sf cr3{VADD(cc_ref(1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))}; + v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; ch_ref(1, k, 2) = VSUB(cr2, ci5); ch_ref(1, k, 3) = VSUB(cr3, ci4); ch_ref(1, k, 4) = VADD(cr3, ci4); @@ -1038,38 +1040,38 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch if(ido == 1) return; - const int idp2 = ido + 2; - for(int k = 1; k <= l1; ++k) + const int idp2{ido + 2}; + for(int k{1};k <= l1;++k) { - for(int i = 3; i <= ido; i += 2) + for(int i{3};i <= ido;i += 2) { - const int ic = idp2 - i; - v4sf ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k)); - v4sf ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k)); - v4sf ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k)); - v4sf ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k)); - v4sf tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k)); - v4sf tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k)); - v4sf tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k)); - v4sf tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k)); + const int ic{idp2 - i}; + v4sf ti5{VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; + v4sf ti2{VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; + v4sf ti4{VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; + v4sf ti3{VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; + v4sf tr5{VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))}; + v4sf tr2{VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))}; + v4sf tr4{VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; + v4sf tr3{VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3)); ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3)); - v4sf cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); - v4sf ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3))); 
- v4sf cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); - v4sf ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3))); - v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); - v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); - v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); - v4sf dr3 = VSUB(cr3, ci4); - v4sf dr4 = VADD(cr3, ci4); - v4sf di3 = VADD(ci3, cr4); - v4sf di4 = VSUB(ci3, cr4); - v4sf dr5 = VADD(cr2, ci5); - v4sf dr2 = VSUB(cr2, ci5); - v4sf di5 = VSUB(ci2, cr5); - v4sf di2 = VADD(ci2, cr5); + v4sf cr2{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1, k), VADD(VMUL(tr11, ti2), VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1, k), VADD(VMUL(tr12, ti2), VMUL(tr11, ti3)))}; + v4sf cr5{VADD(VMUL(ti11, tr5), VMUL(ti12, tr4))}; + v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; + v4sf dr3{VSUB(cr3, ci4)}; + v4sf dr4{VADD(cr3, ci4)}; + v4sf di3{VADD(ci3, cr4)}; + v4sf di4{VSUB(ci3, cr4)}; + v4sf dr5{VADD(cr2, ci5)}; + v4sf dr2{VSUB(cr2, ci5)}; + v4sf di5{VSUB(ci2, cr5)}; + v4sf di2{VADD(ci2, cr5)}; VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); @@ -1085,45 +1087,52 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch #undef ch_ref } /* radb5 */ -static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, - const float *wa, const int *ifac) +static NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, + v4sf *work2, const float *wa, const int *ifac) { - const v4sf *in = input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - const int nf = ifac[1]; - int l2 = n; - int iw = n-1; - assert(in != out && work1 != work2); - for(int k1 = 1; k1 <= nf; ++k1) + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const int nf{ifac[1]}; + int l2{n}; + int iw{n-1}; + for(int k1{1};k1 <= nf;++k1) { - int kh = nf - k1; - int ip = ifac[kh + 2]; - int l1 = l2 / ip; - int ido = n / l2; + int kh{nf - k1}; + int ip{ifac[kh + 2]}; + int l1{l2 / ip}; + int ido{n / l2}; iw -= (ip - 1)*ido; - switch (ip) + switch(ip) { - case 5: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; - int ix4 = ix3 + ido; + case 5: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; + int ix4{ix3 + ido}; radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); - } break; - case 4: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; + } + break; + case 4: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); - } break; - case 3: { - int ix2 = iw + ido; + } + break; + case 3: + { + int ix2{iw + ido}; radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); - } break; - case 2: - radf2_ps(ido, l1, in, out, &wa[iw]); - break; - default: - assert(0); - break; + } + break; + case 2: + radf2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; } l2 = l1; if(out == work2) @@ -1140,43 +1149,50 @@ static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *w return const_cast(in); /* this is in fact the output .. 
*/ } /* rfftf1 */ -static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, - const float *wa, const int *ifac) +static NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *work1, + v4sf *work2, const float *wa, const int *ifac) { - const v4sf *in = input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - const int nf = ifac[1]; - int l1 = 1; - int iw = 0; - assert(in != out); - for(int k1=1; k1<=nf; k1++) + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const int nf{ifac[1]}; + int l1{1}; + int iw{0}; + for(int k1{1};k1 <= nf;++k1) { - int ip = ifac[k1 + 1]; - int l2 = ip*l1; - int ido = n / l2; + int ip{ifac[k1 + 1]}; + int l2{ip*l1}; + int ido{n / l2}; switch(ip) { - case 5: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; - int ix4 = ix3 + ido; + case 5: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; + int ix4{ix3 + ido}; radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); - } break; - case 4: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; + } + break; + case 4: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); - } break; - case 3: { - int ix2 = iw + ido; + } + break; + case 3: + { + int ix2{iw + ido}; radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); - } break; - case 2: - radb2_ps(ido, l1, in, out, &wa[iw]); - break; - default: - assert(0); - break; + } + break; + case 2: + radb2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; } l1 = l2; iw += (ip - 1)*ido; @@ -1195,32 +1211,30 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w return const_cast(in); /* this is in fact the output .. */ } -static int decompose(int n, int *ifac, const int *ntryh) +static int decompose(const int n, int *ifac, const int *ntryh) { - int nl = n, nf = 0; - for(int j=0; ntryh[j]; ++j) + int nl{n}, nf{0}; + for(int j{0};ntryh[j];++j) { - const int ntry = ntryh[j]; + const int ntry{ntryh[j]}; while(nl != 1) { - int nq = nl / ntry; - int nr = nl - ntry*nq; - if(nr == 0) + const int nq{nl / ntry}; + const int nr{nl - ntry*nq}; + if(nr != 0) + break; + + ifac[2+nf++] = ntry; + nl = nq; + if(ntry == 2 && nf != 1) { - ifac[2+nf++] = ntry; - nl = nq; - if(ntry == 2 && nf != 1) + for(int i{2};i <= nf;++i) { - for(int i = 2; i <= nf; ++i) - { - int ib = nf - i + 2; - ifac[ib + 1] = ifac[ib]; - } - ifac[2] = 2; + int ib{nf - i + 2}; + ifac[ib + 1] = ifac[ib]; } + ifac[2] = 2; } - else - break; } } ifac[0] = n; @@ -1230,28 +1244,28 @@ static int decompose(int n, int *ifac, const int *ntryh) -static void rffti1_ps(int n, float *wa, int *ifac) +static void rffti1_ps(const int n, float *wa, int *ifac) { - static constexpr int ntryh[] = { 4,2,3,5,0 }; - - const int nf = decompose(n,ifac,ntryh); - const double argh = 2.0*al::numbers::pi / n; - int is = 0; - int nfm1 = nf - 1; - int l1 = 1; - for(int k1 = 1; k1 <= nfm1; k1++) + static constexpr int ntryh[]{4,2,3,5,0}; + + const int nf{decompose(n, ifac, ntryh)}; + const double argh{2.0*al::numbers::pi / n}; + int is{0}; + int nfm1{nf - 1}; + int l1{1}; + for(int k1{1};k1 <= nfm1;++k1) { - int ip = ifac[k1 + 1]; - int ld = 0; - int l2 = l1*ip; - int ido = n / l2; - int ipm = ip - 1; - for(int j = 1; j <= ipm; ++j) + const int ip{ifac[k1 + 1]}; + const int l2{l1*ip}; + const int ido{n / l2}; + const int ipm{ip - 1}; + int ld{0}; + for(int j{1};j <= ipm;++j) { - int i = is, fi=0; + int i{is}, fi{0}; ld += l1; - double argld = 
ld*argh; - for(int ii = 3; ii <= ido; ii += 2) + double argld{ld*argh}; + for(int ii{3};ii <= ido;ii += 2) { i += 2; fi += 1; @@ -1264,25 +1278,25 @@ static void rffti1_ps(int n, float *wa, int *ifac) } } /* rffti1 */ -void cffti1_ps(int n, float *wa, int *ifac) +void cffti1_ps(const int n, float *wa, int *ifac) { - static constexpr int ntryh[] = { 5,3,4,2,0 }; + static constexpr int ntryh[]{5,3,4,2,0}; - const int nf = decompose(n,ifac,ntryh); - const double argh = 2.0*al::numbers::pi / n; - int i = 1; - int l1 = 1; - for(int k1=1; k1<=nf; k1++) + const int nf{decompose(n, ifac, ntryh)}; + const double argh{2.0*al::numbers::pi / n}; + int i{1}; + int l1{1}; + for(int k1{1};k1 <= nf;++k1) { - int ip = ifac[k1+1]; - int ld = 0; - int l2 = l1*ip; - int ido = n / l2; - int idot = ido + ido + 2; - int ipm = ip - 1; - for(int j=1; j<=ipm; j++) + const int ip{ifac[k1+1]}; + const int l2{l1*ip}; + const int ido{n / l2}; + const int idot{ido + ido + 2}; + const int ipm{ip - 1}; + int ld{0}; + for(int j{1};j <= ipm;++j) { - int i1 = i, fi = 0; + int i1{i}, fi{0}; wa[i-1] = 1; wa[i] = 0; ld += l1; @@ -1305,43 +1319,49 @@ void cffti1_ps(int n, float *wa, int *ifac) } /* cffti1 */ -v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, - const int *ifac, float fsign) +v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, + const int *ifac, const float fsign) { - const v4sf *in = input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - const int nf = ifac[1]; - int l1 = 1; - int iw = 0; - assert(in != out && work1 != work2); - for(int k1=2; k1<=nf+1; k1++) + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const int nf{ifac[1]}; + int l1{1}, iw{0}; + for(int k1{2};k1 <= nf+1;++k1) { - int ip = ifac[k1]; - int l2 = ip*l1; - int ido = n / l2; - int idot = ido + ido; + const int ip{ifac[k1]}; + const int l2{ip*l1}; + const int ido{n / l2}; + const int idot{ido + ido}; switch(ip) { - case 5: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; - int ix4 = ix3 + idot; + case 5: + { + int ix2{iw + idot}; + int ix3{ix2 + idot}; + int ix4{ix3 + idot}; passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); - } break; - case 4: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; + } + break; + case 4: + { + int ix2{iw + idot}; + int ix3{ix2 + idot}; passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); - } break; - case 2: - passf2_ps(idot, l1, in, out, &wa[iw], fsign); - break; - case 3: { - int ix2 = iw + idot; + } + break; + case 3: + { + int ix2{iw + idot}; passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); - } break; - default: - assert(0); + } + break; + case 2: + passf2_ps(idot, l1, in, out, &wa[iw], fsign); + break; + default: + assert(0); } l1 = l2; iw += (ip - 1)*idot; @@ -1362,8 +1382,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con struct PFFFT_Setup { - int N; - int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) int ifac[15]; pffft_transform_t transform; @@ -1384,13 +1404,13 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) else assert((N%(SIMD_SZ*SIMD_SZ)) == 0); - const unsigned int Ncvec = static_cast(transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; - size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; + const auto Ncvec = static_cast(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; + const size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; if(!store) return nullptr; - PFFFT_Setup *s = ::new(store) PFFFT_Setup{}; + PFFFT_Setup *s{::new(store) PFFFT_Setup{}}; s->N = N; s->transform = transform; /* nb of complex simd vectors */ @@ -1400,10 +1420,10 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) if constexpr(SIMD_SZ > 1) { al::vector e(2u*Ncvec*(SIMD_SZ-1)); - for(int k=0; k < s->Ncvec; ++k) + for(int k{0};k < s->Ncvec;++k) { - size_t i{static_cast(k) / SIMD_SZ}; - size_t j{static_cast(k) % SIMD_SZ}; + const size_t i{static_cast(k) / SIMD_SZ}; + const size_t j{static_cast(k) % SIMD_SZ}; for(size_t m{0};m < SIMD_SZ-1;++m) { const double A = -2.0*al::numbers::pi*static_cast(m+1)*k / N; @@ -1419,8 +1439,8 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); /* check that N is decomposable with allowed prime factors */ - int m = 1; - for(int k=0; k < s->ifac[1]; ++k) + int m{1}; + for(int k{0};k < s->ifac[1];++k) m *= s->ifac[2+k]; if(m != N/SIMD_SZ) @@ -1442,17 +1462,18 @@ void pffft_destroy_setup(PFFFT_Setup *s) #if !defined(PFFFT_SIMD_DISABLE) /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) +static void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out) { v4sf g0, g1; INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] - for(int k=1; k < N; ++k) + for(int k{1};k < N;++k) { v4sf h0, h1; - INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + INTERLEAVE2(in[0], in[1], h0, h1); + in += in_stride; *--out = VSWAPHL(g1, h0); *--out = VSWAPHL(h0, h1); g1 = h1; @@ -1460,20 +1481,20 @@ static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) *--out = VSWAPHL(g1, g0); } -static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) +static void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_stride) { - v4sf g0, g1, h0, h1; - g0 = g1 = in[0]; ++in; - for(int k=1; k < N; ++k) + v4sf g0{in[0]}, g1{g0}; + ++in; + for(int k{1};k < N;++k) { - h0 = *in++; h1 = *in++; + v4sf h0{*in++}; v4sf h1{*in++}; g1 = VSWAPHL(g1, h0); h0 = VSWAPHL(h0, h1); UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; g1 = h1; } - h0 = *in++; h1 = g0; + v4sf h0{*in++}, h1{g0}; g1 = VSWAPHL(g1, h0); h0 = VSWAPHL(h0, h1); UNINTERLEAVE2(h0, g1, out[0], out[1]); @@ -1491,7 +1512,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc const int dk{N/32}; if(direction == PFFFT_FORWARD) { - for(int k=0; k < dk; ++k) + for(int k{0};k < dk;++k) { INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); @@ -1613,10 +1634,10 @@ void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4s static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, const v4sf *e, v4sf *out) { - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - r0 = *in0; i0 = *in1; - r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + v4sf 
r0{*in0}, i0{*in1}; + v4sf r1{*in++}; v4sf i1{*in++}; + v4sf r2{*in++}; v4sf i2{*in++}; + v4sf r3{*in++}; v4sf i3{*in++}; VTRANSPOSE4(r0,r1,r2,r3); VTRANSPOSE4(i0,i1,i2,i3); @@ -1643,10 +1664,10 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; - sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); - si0 = VADD(i0,i2); di0 = VSUB(i0,i2); - si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0,r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r3,r1)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0,i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i3,i1)}; r0 = VADD(sr0, sr1); r3 = VSUB(sr0, sr1); @@ -1667,7 +1688,8 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * *out++ = i3; } -static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +static NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *out, + const v4sf *e) { static constexpr float s{al::numbers::sqrt2_v/2.0f}; @@ -1706,9 +1728,10 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o } static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, - int first) + const bool first) { - v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]}; + v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]}; /* transformation for each column is: * @@ -1722,10 +1745,10 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf * [0 1 -1 0 1 0 0 1] [i3] */ - v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); - v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); - v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); - v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + v4sf sr0{VADD(r0,r3)}, dr0{VSUB(r0,r3)}; + v4sf sr1{VADD(r1,r2)}, dr1{VSUB(r1,r2)}; + v4sf si0{VADD(i0,i3)}, di0{VSUB(i0,i3)}; + v4sf si1{VADD(i1,i2)}, di1{VSUB(i1,i2)}; r0 = VADD(sr0, sr1); r2 = VSUB(sr0, sr1); @@ -1756,9 +1779,10 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *out++ = i3; } -static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +static NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf *out, + const v4sf *e) { - static constexpr float s = al::numbers::sqrt2_v; + static constexpr float sqrt2{al::numbers::sqrt2_v}; assert(in != out); const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks @@ -1771,7 +1795,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf Xi[k] = VEXTRACT0(in[4*k + 1]); } - pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values + pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] * @@ -1785,34 +1809,30 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf * [ci3] [0 -s 0 s 0 -s 0 -s] */ for(int k{1};k < dk;++k) - pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false); const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; 
out[0] = VSET4(cr0, cr1, cr2, cr3); - const float ci0{ 2*(Xr[1]+Xr[3])}; - const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; - const float ci2{ 2*(Xi[3]-Xi[1])}; - const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; + const float ci0{ 2*(Xr[1]+Xr[3])}; + const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])}; + const float ci2{ 2*(Xi[3]-Xi[1])}; + const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])}; out[2*Ncvec-1] = VSET4(ci0, ci1, ci2, ci3); } -void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, - v4sf *scratch, pffft_direction_t direction, int ordered) +void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput, + v4sf *scratch, const pffft_direction_t direction, const bool ordered) { assert(scratch != nullptr); - assert(VALIGNED(finput) && VALIGNED(foutput) && VALIGNED(scratch)); + assert(voutput != scratch); const int Ncvec{setup->Ncvec}; const int nf_odd{setup->ifac[1] & 1}; - auto *vinput = reinterpret_cast(finput); - auto *voutput = reinterpret_cast(foutput); - assert(voutput != scratch); - v4sf *buff[2]{voutput, scratch}; int ib{(nf_odd ^ ordered) ? 1 : 0}; if(direction == PFFFT_FORWARD) @@ -1870,21 +1890,18 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo if(buff[ib] != voutput) { /* extra copy required -- this situation should only happen when finput == foutput */ - assert(finput==foutput); + assert(vinput==voutput); for(int k{0};k < Ncvec;++k) { v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; voutput[2*k] = a; voutput[2*k+1] = b; } - ib = !ib; } } void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - const int Ncvec{s->Ncvec}; const v4sf *RESTRICT va{reinterpret_cast(a)}; const v4sf *RESTRICT vb{reinterpret_cast(b)}; @@ -1911,12 +1928,12 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #ifndef ZCONVOLVE_USING_INLINE_ASM const v4sf vscal{LD_PS1(scaling)}; #endif - float ar1{VEXTRACT0(va[0])}; - float ai1{VEXTRACT0(va[1])}; - float br1{VEXTRACT0(vb[0])}; - float bi1{VEXTRACT0(vb[1])}; - float abr1{VEXTRACT0(vab[0])}; - float abi1{VEXTRACT0(vab[1])}; + const float ar1{VEXTRACT0(va[0])}; + const float ai1{VEXTRACT0(va[1])}; + const float br1{VEXTRACT0(vb[0])}; + const float bi1{VEXTRACT0(vb[1])}; + const float abr1{VEXTRACT0(vab[0])}; + const float abi1{VEXTRACT0(vab[1])}; #ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. 
so this will be restricted to gcc const float *a_{a}, *b_{b}; float *ab_{ab}; @@ -1957,7 +1974,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #else // default routine, works fine for non-arm cpus with current compilers - for(int i=0; i < Ncvec; i += 2) + for(int i{0};i < Ncvec;i += 2) { v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; @@ -1980,6 +1997,22 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, } +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +{ + assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work)); + pffft_transform_internal(setup, reinterpret_cast(al::assume_aligned<16>(input)), + reinterpret_cast(al::assume_aligned<16>(output)), + reinterpret_cast(al::assume_aligned<16>(work)), direction, false); +} + +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +{ + assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work)); + pffft_transform_internal(setup, reinterpret_cast(al::assume_aligned<16>(input)), + reinterpret_cast(al::assume_aligned<16>(output)), + reinterpret_cast(al::assume_aligned<16>(work)), direction, true); +} + #else // defined(PFFFT_SIMD_DISABLE) // standard routine using scalar floats, without SIMD stuff. @@ -1988,25 +2021,25 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { - const int N = setup->N; + const int N{setup->N}; if(setup->transform == PFFFT_COMPLEX) { - for(int k=0; k < 2*N; ++k) + for(int k{0};k < 2*N;++k) out[k] = in[k]; return; } else if(direction == PFFFT_FORWARD) { - float x_N = in[N-1]; - for(int k=N-1; k > 1; --k) + float x_N{in[N-1]}; + for(int k{N-1};k > 1;--k) out[k] = in[k-1]; out[0] = in[0]; out[1] = x_N; } else { - float x_N = in[1]; - for(int k=1; k < N-1; ++k) + float x_N{in[1]}; + for(int k{1};k < N-1;++k) out[k] = in[k+1]; out[0] = in[0]; out[N-1] = x_N; @@ -2015,7 +2048,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, #define pffft_transform_internal_nosimd pffft_transform_internal void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, - float *scratch, pffft_direction_t direction, int ordered) + float *scratch, const pffft_direction_t direction, bool ordered) { const int Ncvec{setup->Ncvec}; const int nf_odd{setup->ifac[1] & 1}; @@ -2061,12 +2094,11 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo { // extra copy required -- this situation should happens only when finput == foutput assert(input==output); - for(int k=0; k < Ncvec; ++k) + for(int k{0};k < Ncvec;++k) { - float a = buff[ib][2*k], b = buff[ib][2*k+1]; + float a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; output[2*k] = a; output[2*k+1] = b; } - ib = !ib; } } @@ -2093,14 +2125,15 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo } } -#endif // defined(PFFFT_SIMD_DISABLE) void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - pffft_transform_internal(setup, input, output, reinterpret_cast(work), direction, 0); + pffft_transform_internal(setup, input, output, work, direction, false); } void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, 
pffft_direction_t direction) { - pffft_transform_internal(setup, input, output, reinterpret_cast(work), direction, 1); + pffft_transform_internal(setup, input, output, work, direction, true); } + +#endif // defined(PFFFT_SIMD_DISABLE) -- cgit v1.2.3 From 6a9c72760b785a4f7964bc6febbe04a5232df281 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 9 Oct 2023 05:26:19 -0700 Subject: Use a bool instead of an int for 0/1 Also update some comments. --- common/pffft.cpp | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 8eb5a19b..0c8bf063 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -218,7 +218,7 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 #define VSET4(a, b, c, d) v4sf{(a), (b), (c), (d)} -[[gnu::always_inline]] inline v4sf vinsert0(v4sf v, float a) noexcept +constexpr v4sf vinsert0(v4sf v, float a) noexcept { return v4sf{a, v[1], v[2], v[3]}; } #define VINSERT0 vinsert0 #define VEXTRACT0(v) ((v)[0]) @@ -305,7 +305,6 @@ void validate_pffft_simd() std::memcpy(&a2_v, f+8, 4*sizeof(float)); std::memcpy(&a3_v, f+12, 4*sizeof(float)); - t_v = a0_v; u_v = a1_v; t_v = VZERO(); t_f = al::bit_cast(t_v); printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 0, 0, 0, 0); t_v = VADD(a1_v, a2_v); t_f = al::bit_cast(t_v); @@ -1522,7 +1521,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc } else { - for(int k=0; k < dk; ++k) + for(int k{0};k < dk;++k) { UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); @@ -1535,17 +1534,17 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc { if(direction == PFFFT_FORWARD) { - for(int k=0; k < Ncvec; ++k) + for(int k{0};k < Ncvec;++k) { - int kk = (k/4) + (k%4)*(Ncvec/4); + int kk{(k/4) + (k%4)*(Ncvec/4)}; INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); } } else { - for(int k=0; k < Ncvec; ++k) + for(int k{0};k < Ncvec;++k) { - int kk = (k/4) + (k%4)*(Ncvec/4); + int kk{(k/4) + (k%4)*(Ncvec/4)}; UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); } } @@ -1557,7 +1556,7 @@ void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf assert(in != out); const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks - for(int k=0; k < dk; ++k) + for(int k{0};k < dk;++k) { v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; @@ -1601,7 +1600,7 @@ void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4s assert(in != out); const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks - for(int k=0; k < dk; ++k) + for(int k{0};k < dk;++k) { v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; @@ -1641,8 +1640,7 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * VTRANSPOSE4(r0,r1,r2,r3); VTRANSPOSE4(i0,i1,i2,i3); - /* - * transformation for each column is: + /* transformation for each column is: * * [1 1 1 1 0 0 0 0] [r0] * [1 0 -1 0 0 -1 0 1] [r1] @@ -1831,10 +1829,10 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout assert(voutput != scratch); const int Ncvec{setup->Ncvec}; - const int nf_odd{setup->ifac[1] & 1}; + const bool 
nf_odd{(setup->ifac[1]&1) != 0}; v4sf *buff[2]{voutput, scratch}; - int ib{(nf_odd ^ ordered) ? 1 : 0}; + bool ib{nf_odd != ordered}; if(direction == PFFFT_FORWARD) { /* Swap the initial work buffer for forward FFTs, which helps avoid an @@ -1925,9 +1923,6 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #endif #endif -#ifndef ZCONVOLVE_USING_INLINE_ASM - const v4sf vscal{LD_PS1(scaling)}; -#endif const float ar1{VEXTRACT0(va[0])}; const float ai1{VEXTRACT0(va[1])}; const float br1{VEXTRACT0(vb[0])}; @@ -1935,7 +1930,13 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, const float abr1{VEXTRACT0(vab[0])}; const float abi1{VEXTRACT0(vab[1])}; -#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc +#ifdef ZCONVOLVE_USING_INLINE_ASM + /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on + * Ubuntu. So this will be restricted to GCC. + * + * Does it still miscompile with Clang? Is it even needed with today's + * optimizers? + */ const float *a_{a}, *b_{b}; float *ab_{ab}; int N{Ncvec}; asm volatile("mov r8, %2 \n" @@ -1972,8 +1973,10 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, "bne 1b \n" : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); -#else // default routine, works fine for non-arm cpus with current compilers +#else + /* Default routine, works fine for non-arm cpus with current compilers. */ + const v4sf vscal{LD_PS1(scaling)}; for(int i{0};i < Ncvec;i += 2) { v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; @@ -2051,17 +2054,16 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo float *scratch, const pffft_direction_t direction, bool ordered) { const int Ncvec{setup->Ncvec}; - const int nf_odd{setup->ifac[1] & 1}; + const bool nf_odd{(setup->ifac[1]&1) != 0}; assert(scratch != nullptr); /* z-domain data for complex transforms is already ordered without SIMD. */ if(setup->transform == PFFFT_COMPLEX) - ordered = 0; + ordered = false; float *buff[2]{output, scratch}; - int ib{(nf_odd ^ ordered) ? 1 : 0}; - + bool ib{nf_odd != ordered}; if(direction == PFFFT_FORWARD) { if(setup->transform == PFFFT_REAL) @@ -2115,10 +2117,10 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; ++ab; ++a; ++b; --Ncvec; } - for(int i=0; i < Ncvec; ++i) + for(int i{0};i < Ncvec;++i) { - float ar = a[2*i+0], ai = a[2*i+1]; - const float br = b[2*i+0], bi = b[2*i+1]; + float ar{a[2*i+0]}, ai{a[2*i+1]}; + const float br{b[2*i+0]}, bi{b[2*i+1]}; VCPLXMUL(ar, ai, br, bi); ab[2*i+0] += ar*scaling; ab[2*i+1] += ai*scaling; -- cgit v1.2.3 From 2d1c0e1050fc5844d5befb2c1b739463b0053ddc Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 10 Oct 2023 04:40:09 -0700 Subject: Combine some VADD(VMUL(... to VMADD(... 
--- common/pffft.cpp | 86 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 0c8bf063..2e9b6f67 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -68,6 +68,7 @@ #include "albit.h" #include "almalloc.h" #include "alnumbers.h" +#include "alspan.h" #include "opthelpers.h" #include "vector.h" @@ -648,14 +649,14 @@ static NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, static void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2) { - const v4sf vtaur{LD_PS1(-0.5f)}; - const v4sf vtaui{LD_PS1(0.866025403784439f)}; + const v4sf taur{LD_PS1(-0.5f)}; + const v4sf taui{LD_PS1(0.866025403784439f)}; for(int k{0};k < l1;++k) { v4sf cr2{VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])}; ch[3*k*ido] = VADD(cc[k*ido], cr2); - ch[(3*k+2)*ido] = VMUL(vtaui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); - ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(vtaur, cr2)); + ch[(3*k+2)*ido] = VMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); + ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(taur, cr2)); } if(ido == 1) return; @@ -680,10 +681,10 @@ static void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf v4sf ci2{VADD(di2, di3)}; ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2); ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2); - v4sf tr2{VADD(cc[i - 1 + k*ido], VMUL(vtaur, cr2))}; - v4sf ti2{VADD(cc[i + k*ido], VMUL(vtaur, ci2))}; - v4sf tr3{VMUL(vtaui, VSUB(di2, di3))}; - v4sf ti3{VMUL(vtaui, VSUB(dr3, dr2))}; + v4sf tr2{VADD(cc[i - 1 + k*ido], VMUL(taur, cr2))}; + v4sf ti2{VADD(cc[i + k*ido], VMUL(taur, ci2))}; + v4sf tr3{VMUL(taui, VSUB(di2, di3))}; + v4sf ti3{VMUL(taui, VSUB(dr3, dr2))}; ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3); ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3); ch[i + (3*k + 2)*ido] = VADD(ti2, ti3); @@ -742,13 +743,14 @@ static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch } } /* radb3 */ -static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, - const float *RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3) +static NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc, + v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, + const float *RESTRICT wa3) { const int l1ido{l1*ido}; { - const v4sf *RESTRICT cc_ = cc, *RESTRICT cc_end = cc + l1ido; - v4sf *RESTRICT ch_ = ch; + const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido}; + v4sf *RESTRICT ch_{ch}; while(cc != cc_end) { // this loop represents between 25% and 40% of total radf4_ps cost ! 
@@ -942,11 +944,11 @@ static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf v4sf ci5{VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2))}; v4sf cr3{VADD(cc_ref(1, k, 4), cc_ref(1, k, 3))}; v4sf ci4{VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3))}; - ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); - ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3))); - ch_ref(1, 3, k) = VADD(VMUL(ti11, ci5), VMUL(ti12, ci4)); - ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3))); - ch_ref(1, 5, k) = VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4)); + ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); + ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VMADD(tr11, cr2, VMUL(tr12, cr3))); + ch_ref(1, 3, k) = VMADD(ti11, ci5, VMUL(ti12, ci4)); + ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VMADD(tr12, cr2, VMUL(tr11, cr3))); + ch_ref(1, 5, k) = VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4)); //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); } if(ido == 1) @@ -980,12 +982,12 @@ static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf v4sf ci3{VADD(di3, di4)}; ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3)); - v4sf tr2{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3)))}; - v4sf ti2{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr11, ci2), VMUL(tr12, ci3)))}; - v4sf tr3{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3)))}; - v4sf ti3{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr12, ci2), VMUL(tr11, ci3)))}; - v4sf tr5{VADD(VMUL(ti11, cr5), VMUL(ti12, cr4))}; - v4sf ti5{VADD(VMUL(ti11, ci5), VMUL(ti12, ci4))}; + v4sf tr2{VADD(cc_ref(i - 1, k, 1), VMADD(tr11, cr2, VMUL(tr12, cr3)))}; + v4sf ti2{VSUB(cc_ref(i, k, 1), VMADD(tr11, ci2, VMUL(tr12, ci3)))}; + v4sf tr3{VADD(cc_ref(i - 1, k, 1), VMADD(tr12, cr2, VMUL(tr11, cr3)))}; + v4sf ti3{VSUB(cc_ref(i, k, 1), VMADD(tr12, ci2, VMUL(tr11, ci3)))}; + v4sf tr5{VMADD(ti11, cr5, VMUL(ti12, cr4))}; + v4sf ti5{VMADD(ti11, ci5, VMUL(ti12, ci4))}; v4sf tr4{VSUB(VMUL(ti12, cr5), VMUL(ti11, cr4))}; v4sf ti4{VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4))}; ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); @@ -1027,9 +1029,9 @@ static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf v4sf tr2{VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k))}; v4sf tr3{VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k))}; ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); - v4sf cr2{VADD(cc_ref(1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))}; - v4sf cr3{VADD(cc_ref(1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))}; - v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf cr2{VADD(cc_ref(1, 1, k), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; + v4sf cr3{VADD(cc_ref(1, 1, k), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; + v4sf ci5{VMADD(ti11, ti5, VMUL(ti12, ti4))}; v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; ch_ref(1, k, 2) = VSUB(cr2, ci5); ch_ref(1, k, 3) = VSUB(cr3, ci4); @@ -1055,12 +1057,12 @@ static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf v4sf tr3{VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3)); ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3)); - v4sf cr2{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))}; - v4sf ci2{VADD(cc_ref(i , 1, k), VADD(VMUL(tr11, ti2), VMUL(tr12, ti3)))}; - v4sf cr3{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))}; - v4sf ci3{VADD(cc_ref(i , 1, k), VADD(VMUL(tr12, ti2), 
VMUL(tr11, ti3)))}; - v4sf cr5{VADD(VMUL(ti11, tr5), VMUL(ti12, tr4))}; - v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf cr2{VADD(cc_ref(i-1, 1, k), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1, k), VMADD(tr11, ti2, VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1, k), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1, k), VMADD(tr12, ti2, VMUL(tr11, ti3)))}; + v4sf cr5{VMADD(ti11, tr5, VMUL(ti12, tr4))}; + v4sf ci5{VMADD(ti11, ti5, VMUL(ti12, ti4))}; v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; v4sf dr3{VSUB(cr3, ci4)}; @@ -1210,18 +1212,16 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v return const_cast(in); /* this is in fact the output .. */ } -static int decompose(const int n, int *ifac, const int *ntryh) +static int decompose(const int n, int *ifac, const al::span ntryh) { int nl{n}, nf{0}; - for(int j{0};ntryh[j];++j) + for(const int ntry : ntryh) { - const int ntry{ntryh[j]}; while(nl != 1) { const int nq{nl / ntry}; - const int nr{nl - ntry*nq}; - if(nr != 0) - break; + const int nr{nl % ntry}; + if(nr != 0) break; ifac[2+nf++] = ntry; nl = nq; @@ -1245,7 +1245,7 @@ static int decompose(const int n, int *ifac, const int *ntryh) static void rffti1_ps(const int n, float *wa, int *ifac) { - static constexpr int ntryh[]{4,2,3,5,0}; + static constexpr int ntryh[]{4,2,3,5}; const int nf{decompose(n, ifac, ntryh)}; const double argh{2.0*al::numbers::pi / n}; @@ -1279,7 +1279,7 @@ static void rffti1_ps(const int n, float *wa, int *ifac) void cffti1_ps(const int n, float *wa, int *ifac) { - static constexpr int ntryh[]{5,3,4,2,0}; + static constexpr int ntryh[]{5,3,4,2}; const int nf{decompose(n, ifac, ntryh)}; const double argh{2.0*al::numbers::pi / n}; @@ -1299,8 +1299,8 @@ void cffti1_ps(const int n, float *wa, int *ifac) wa[i-1] = 1; wa[i] = 0; ld += l1; - double argld = ld*argh; - for(int ii = 4; ii <= idot; ii += 2) + const double argld{ld*argh}; + for(int ii{4};ii <= idot;ii += 2) { i += 2; fi += 1; @@ -1425,7 +1425,7 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) const size_t j{static_cast(k) % SIMD_SZ}; for(size_t m{0};m < SIMD_SZ-1;++m) { - const double A = -2.0*al::numbers::pi*static_cast(m+1)*k / N; + const double A{-2.0*al::numbers::pi*static_cast(m+1)*k / N}; e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); } -- cgit v1.2.3 From 71afec135f22f8586d82f6730b569f6f7c82e457 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 10 Oct 2023 05:09:53 -0700 Subject: Use an anonymous namespace instead of static functions --- common/pffft.cpp | 396 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 202 insertions(+), 194 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 2e9b6f67..e42751c5 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -72,6 +72,9 @@ #include "opthelpers.h" #include "vector.h" + +namespace { + #if defined(__GNUC__) #define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) #define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) @@ -294,7 +297,7 @@ typedef float v4sf; #define assertv4(v,f0,f1,f2,f3) assert(v##_f[0] == (f0) && v##_f[1] == (f1) && v##_f[2] == (f2) && v##_f[3] == (f3)) /* detect bugs with the vector support macros */ -void validate_pffft_simd() +[[maybe_unused]] void validate_pffft_simd() { using float4 
= std::array; static constexpr float f[16]{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; @@ -343,17 +346,10 @@ void validate_pffft_simd() /* SSE and co like 16-bytes aligned pointers */ #define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... -void *pffft_aligned_malloc(size_t nb_bytes) -{ return al_malloc(MALLOC_V4SF_ALIGNMENT, nb_bytes); } - -void pffft_aligned_free(void *p) { al_free(p); } - -int pffft_simd_size() { return SIMD_SZ; } - /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 */ -static NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, const float *wa1, const float fsign) { const int l1ido{l1*ido}; @@ -390,26 +386,26 @@ static NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ -static NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, const float fsign) { assert(ido > 2); - const v4sf vtaur{LD_PS1(-0.5f)}; - const v4sf vtaui{LD_PS1(0.866025403784439f*fsign)}; + const v4sf taur{LD_PS1(-0.5f)}; + const v4sf taui{LD_PS1(0.866025403784439f*fsign)}; const int l1ido{l1*ido}; for(int k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido) { for(int i{0};i < ido-1;i += 2) { v4sf tr2{VADD(cc[i+ido], cc[i+2*ido])}; - v4sf cr2{VADD(cc[i], VMUL(vtaur,tr2))}; + v4sf cr2{VADD(cc[i], VMUL(taur,tr2))}; ch[i] = VADD(cc[i], tr2); v4sf ti2{VADD(cc[i+ido+1], cc[i+2*ido+1])}; - v4sf ci2{VADD(cc[i +1], VMUL(vtaur,ti2))}; + v4sf ci2{VADD(cc[i +1], VMUL(taur,ti2))}; ch[i+1] = VADD(cc[i+1], ti2); - v4sf cr3{VMUL(vtaui, VSUB(cc[i+ido], cc[i+2*ido]))}; - v4sf ci3{VMUL(vtaui, VSUB(cc[i+ido+1], cc[i+2*ido+1]))}; + v4sf cr3{VMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]))}; + v4sf ci3{VMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]))}; v4sf dr2{VSUB(cr2, ci3)}; v4sf dr3{VADD(cr2, ci3)}; v4sf di2{VADD(ci2, cr3)}; @@ -425,7 +421,7 @@ static NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, } } /* passf3 */ -static NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, const float *wa3, const float fsign) { /* fsign == -1 for forward transform and +1 for backward transform */ @@ -500,13 +496,13 @@ static NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, /* * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 */ -static NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign) { - const v4sf vtr11{LD_PS1(0.309016994374947f)}; - const v4sf vtr12{LD_PS1(-0.809016994374947f)}; - const v4sf vti11{LD_PS1(0.951056516295154f*fsign)}; - const v4sf vti12{LD_PS1(0.587785252292473f*fsign)}; + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f*fsign)}; + const v4sf ti12{LD_PS1(0.587785252292473f*fsign)}; #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + (a_1) + 1] #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + 
(a_1) + 1] @@ -526,14 +522,14 @@ static NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf tr3{VADD(cc_ref(i-1, 3), cc_ref(i-1, 4))}; ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); - v4sf cr2{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr11, tr2),VMUL(vtr12, tr3)))}; - v4sf ci2{VADD(cc_ref(i , 1), VADD(VMUL(vtr11, ti2),VMUL(vtr12, ti3)))}; - v4sf cr3{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr12, tr2),VMUL(vtr11, tr3)))}; - v4sf ci3{VADD(cc_ref(i , 1), VADD(VMUL(vtr12, ti2),VMUL(vtr11, ti3)))}; - v4sf cr5{VADD(VMUL(vti11, tr5), VMUL(vti12, tr4))}; - v4sf ci5{VADD(VMUL(vti11, ti5), VMUL(vti12, ti4))}; - v4sf cr4{VSUB(VMUL(vti12, tr5), VMUL(vti11, tr4))}; - v4sf ci4{VSUB(VMUL(vti12, ti5), VMUL(vti11, ti4))}; + v4sf cr2{VADD(cc_ref(i-1, 1), VADD(VMUL(tr11, tr2),VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1), VADD(VMUL(tr11, ti2),VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1), VADD(VMUL(tr12, tr2),VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1), VADD(VMUL(tr12, ti2),VMUL(tr11, ti3)))}; + v4sf cr5{VADD(VMUL(ti11, tr5), VMUL(ti12, tr4))}; + v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; v4sf dr3{VSUB(cr3, ci4)}; v4sf dr4{VADD(cr3, ci4)}; v4sf di3{VADD(ci3, cr4)}; @@ -562,7 +558,7 @@ static NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, #undef cc_ref } -static NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc, +NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1) { const int l1ido{l1*ido}; @@ -601,7 +597,7 @@ static NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *REST } /* radf2 */ -static NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, const float *wa1) { const int l1ido{l1*ido}; @@ -646,7 +642,7 @@ static NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, } } /* radb2 */ -static void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, +void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2) { const v4sf taur{LD_PS1(-0.5f)}; @@ -694,7 +690,7 @@ static void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf } /* radf3 */ -static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, +void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2) { static constexpr float taur{-0.5f}; @@ -743,7 +739,7 @@ static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch } } /* radb3 */ -static NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc, +NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) { @@ -834,7 +830,7 @@ static NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *REST } /* radf4 */ -static NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT cc, +NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, const float 
*RESTRICT wa3) { @@ -920,7 +916,7 @@ static NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RES } } /* radb4 */ -static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, +void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -1004,7 +1000,7 @@ static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf #undef ch_ref } /* radf5 */ -static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, +void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -1088,8 +1084,8 @@ static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf #undef ch_ref } /* radb5 */ -static NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, - v4sf *work2, const float *wa, const int *ifac) +NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const int *ifac) { assert(work1 != work2); @@ -1150,8 +1146,8 @@ static NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v return const_cast(in); /* this is in fact the output .. */ } /* rfftf1 */ -static NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *work1, - v4sf *work2, const float *wa, const int *ifac) +NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const int *ifac) { assert(work1 != work2); @@ -1212,7 +1208,69 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v return const_cast(in); /* this is in fact the output .. */ } -static int decompose(const int n, int *ifac, const al::span ntryh) +v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, + const int *ifac, const float fsign) +{ + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const int nf{ifac[1]}; + int l1{1}, iw{0}; + for(int k1{2};k1 <= nf+1;++k1) + { + const int ip{ifac[k1]}; + const int l2{ip*l1}; + const int ido{n / l2}; + const int idot{ido + ido}; + switch(ip) + { + case 5: + { + int ix2{iw + idot}; + int ix3{ix2 + idot}; + int ix4{ix3 + idot}; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); + } + break; + case 4: + { + int ix2{iw + idot}; + int ix3{ix2 + idot}; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); + } + break; + case 3: + { + int ix2{iw + idot}; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); + } + break; + case 2: + passf2_ps(idot, l1, in, out, &wa[iw], fsign); + break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if(out == work2) + { + out = work1; + in = work2; + } + else + { + out = work2; + in = work1; + } + } + + return const_cast(in); /* this is in fact the output .. 
*/ +} + + +int decompose(const int n, int *ifac, const al::span ntryh) { int nl{n}, nf{0}; for(const int ntry : ntryh) @@ -1241,9 +1299,7 @@ static int decompose(const int n, int *ifac, const al::span ntryh) return nf; } - - -static void rffti1_ps(const int n, float *wa, int *ifac) +void rffti1_ps(const int n, float *wa, int *ifac) { static constexpr int ntryh[]{4,2,3,5}; @@ -1317,68 +1373,14 @@ void cffti1_ps(const int n, float *wa, int *ifac) } } /* cffti1 */ +} // namespace -v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, - const int *ifac, const float fsign) -{ - assert(work1 != work2); - - const v4sf *in{input_readonly}; - v4sf *out{in == work2 ? work1 : work2}; - const int nf{ifac[1]}; - int l1{1}, iw{0}; - for(int k1{2};k1 <= nf+1;++k1) - { - const int ip{ifac[k1]}; - const int l2{ip*l1}; - const int ido{n / l2}; - const int idot{ido + ido}; - switch(ip) - { - case 5: - { - int ix2{iw + idot}; - int ix3{ix2 + idot}; - int ix4{ix3 + idot}; - passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); - } - break; - case 4: - { - int ix2{iw + idot}; - int ix3{ix2 + idot}; - passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); - } - break; - case 3: - { - int ix2{iw + idot}; - passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); - } - break; - case 2: - passf2_ps(idot, l1, in, out, &wa[iw], fsign); - break; - default: - assert(0); - } - l1 = l2; - iw += (ip - 1)*idot; - if(out == work2) - { - out = work1; - in = work2; - } - else - { - out = work2; - in = work1; - } - } +void *pffft_aligned_malloc(size_t nb_bytes) +{ return al_malloc(MALLOC_V4SF_ALIGNMENT, nb_bytes); } - return const_cast(in); /* this is in fact the output .. */ -} +void pffft_aligned_free(void *p) { al_free(p); } +int pffft_simd_size() { return SIMD_SZ; } struct PFFFT_Setup { int N; @@ -1460,8 +1462,10 @@ void pffft_destroy_setup(PFFFT_Setup *s) #if !defined(PFFFT_SIMD_DISABLE) +namespace { + /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -static void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out) +void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out) { v4sf g0, g1; INTERLEAVE2(in[0], in[1], g0, g1); @@ -1480,7 +1484,7 @@ static void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *--out = VSWAPHL(g1, g0); } -static void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_stride) +void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_stride) { v4sf g0{in[0]}, g1{g0}; ++in; @@ -1499,58 +1503,6 @@ static void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int ou UNINTERLEAVE2(h0, g1, out[0], out[1]); } -void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) -{ - assert(in != out); - - const int N{setup->N}, Ncvec{setup->Ncvec}; - const v4sf *vin{reinterpret_cast(in)}; - v4sf *vout{reinterpret_cast(out)}; - if(setup->transform == PFFFT_REAL) - { - const int dk{N/32}; - if(direction == PFFFT_FORWARD) - { - for(int k{0};k < dk;++k) - { - INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); - INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); - } - reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2); - reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ); - } - else - { - for(int k{0};k < dk;++k) - { - UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); - 
UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); - } - unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8); - unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8); - } - } - else - { - if(direction == PFFFT_FORWARD) - { - for(int k{0};k < Ncvec;++k) - { - int kk{(k/4) + (k%4)*(Ncvec/4)}; - INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); - } - } - else - { - for(int k{0};k < Ncvec;++k) - { - int kk{(k/4) + (k%4)*(Ncvec/4)}; - UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); - } - } - } -} - void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { assert(in != out); @@ -1630,8 +1582,8 @@ void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4s } -static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, - const v4sf *in, const v4sf *e, v4sf *out) +ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, + const v4sf *e, v4sf *out) { v4sf r0{*in0}, i0{*in1}; v4sf r1{*in++}; v4sf i1{*in++}; @@ -1686,8 +1638,7 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * *out++ = i3; } -static NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *out, - const v4sf *e) +NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { static constexpr float s{al::numbers::sqrt2_v/2.0f}; @@ -1725,7 +1676,7 @@ static NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); } -static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, +ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, const bool first) { v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]}; @@ -1777,8 +1728,7 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *out++ = i3; } -static NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf *out, - const v4sf *e) +NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { static constexpr float sqrt2{al::numbers::sqrt2_v}; @@ -1897,6 +1847,60 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout } } +} // namespace + +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) +{ + assert(in != out); + + const int N{setup->N}, Ncvec{setup->Ncvec}; + const v4sf *vin{reinterpret_cast(in)}; + v4sf *vout{reinterpret_cast(out)}; + if(setup->transform == PFFFT_REAL) + { + const int dk{N/32}; + if(direction == PFFFT_FORWARD) + { + for(int k{0};k < dk;++k) + { + INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2); + reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ); + } + else + { + for(int k{0};k < dk;++k) + { + UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8); + unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8); + } + } + else + { + if(direction == PFFFT_FORWARD) + { + for(int k{0};k 
< Ncvec;++k) + { + int kk{(k/4) + (k%4)*(Ncvec/4)}; + INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + } + } + else + { + for(int k{0};k < Ncvec;++k) + { + int kk{(k/4) + (k%4)*(Ncvec/4)}; + UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { @@ -2020,34 +2024,7 @@ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *outp // standard routine using scalar floats, without SIMD stuff. -#define pffft_zreorder_nosimd pffft_zreorder -void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, - pffft_direction_t direction) -{ - const int N{setup->N}; - if(setup->transform == PFFFT_COMPLEX) - { - for(int k{0};k < 2*N;++k) - out[k] = in[k]; - return; - } - else if(direction == PFFFT_FORWARD) - { - float x_N{in[N-1]}; - for(int k{N-1};k > 1;--k) - out[k] = in[k-1]; - out[0] = in[0]; - out[1] = x_N; - } - else - { - float x_N{in[1]}; - for(int k{1};k < N-1;++k) - out[k] = in[k+1]; - out[0] = in[0]; - out[N-1] = x_N; - } -} +namespace { #define pffft_transform_internal_nosimd pffft_transform_internal void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, @@ -2104,6 +2081,37 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo } } +} // namespace + +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, + pffft_direction_t direction) +{ + const int N{setup->N}; + if(setup->transform == PFFFT_COMPLEX) + { + for(int k{0};k < 2*N;++k) + out[k] = in[k]; + return; + } + else if(direction == PFFFT_FORWARD) + { + float x_N{in[N-1]}; + for(int k{N-1};k > 1;--k) + out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } + else + { + float x_N{in[1]}; + for(int k{1};k < N-1;++k) + out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + #define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) -- cgit v1.2.3 From 4ef42544d8223060632d4f073b0fa4c059053619 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 10 Oct 2023 06:08:06 -0700 Subject: Use size_t and uint for non-negative values and indices --- common/pffft.cpp | 416 +++++++++++++++++++++++++++---------------------------- common/pffft.h | 4 +- 2 files changed, 210 insertions(+), 210 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index e42751c5..1ec7514a 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -75,6 +75,8 @@ namespace { +using uint = unsigned int; + #if defined(__GNUC__) #define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) #define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) @@ -349,13 +351,13 @@ typedef float v4sf; /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 */ -NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, const float *wa1, const float fsign) { - const int l1ido{l1*ido}; + const size_t l1ido{l1*ido}; if(ido <= 2) { - for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) + for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { ch[0] = VADD(cc[0], cc[ido+0]); ch[l1ido] = VSUB(cc[0], cc[ido+0]); @@ -366,9 +368,9 @@ 
NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf * else { const v4sf vsign{LD_PS1(fsign)}; - for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) + for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { - for(int i{0};i < ido-1;i += 2) + for(size_t i{0};i < ido-1;i += 2) { v4sf tr2{VSUB(cc[i+0], cc[i+ido+0])}; v4sf ti2{VSUB(cc[i+1], cc[i+ido+1])}; @@ -386,17 +388,17 @@ NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf * /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ -NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, const float fsign) { assert(ido > 2); const v4sf taur{LD_PS1(-0.5f)}; const v4sf taui{LD_PS1(0.866025403784439f*fsign)}; - const int l1ido{l1*ido}; - for(int k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido) + const size_t l1ido{l1*ido}; + for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido) { - for(int i{0};i < ido-1;i += 2) + for(size_t i{0};i < ido-1;i += 2) { v4sf tr2{VADD(cc[i+ido], cc[i+2*ido])}; v4sf cr2{VADD(cc[i], VMUL(taur,tr2))}; @@ -421,15 +423,15 @@ NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, v4sf * } } /* passf3 */ -NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, const float *wa3, const float fsign) { /* fsign == -1 for forward transform and +1 for backward transform */ const v4sf vsign{LD_PS1(fsign)}; - const int l1ido{l1*ido}; + const size_t l1ido{l1*ido}; if(ido == 2) { - for(int k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido) + for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido) { v4sf tr1{VSUB(cc[0], cc[2*ido + 0])}; v4sf tr2{VADD(cc[0], cc[2*ido + 0])}; @@ -452,9 +454,9 @@ NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf * } else { - for(int k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido) + for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido) { - for(int i{0};i < ido-1;i+=2) + for(size_t i{0};i < ido-1;i+=2) { v4sf tr1{VSUB(cc[i + 0], cc[i + 2*ido + 0])}; v4sf tr2{VADD(cc[i + 0], cc[i + 2*ido + 0])}; @@ -496,7 +498,7 @@ NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf * /* * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 */ -NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -508,9 +510,9 @@ NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf * #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + (a_1) + 1] assert(ido > 2); - for(int k{0};k < l1;++k, cc += 5*ido, ch += ido) + for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido) { - for(int i{0};i < ido-1;i += 2) + for(size_t i{0};i < ido-1;i += 2) { v4sf ti5{VSUB(cc_ref(i , 2), cc_ref(i , 5))}; v4sf ti2{VADD(cc_ref(i , 2), cc_ref(i , 5))}; @@ -558,11 +560,11 @@ NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf * #undef cc_ref } -NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc, +NEVER_INLINE(void) radf2_ps(const 
size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1) { - const int l1ido{l1*ido}; - for(int k{0};k < l1ido;k += ido) + const size_t l1ido{l1*ido}; + for(size_t k{0};k < l1ido;k += ido) { v4sf a{cc[k]}, b{cc[k + l1ido]}; ch[2*k] = VADD(a, b); @@ -572,9 +574,9 @@ NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc return; if(ido != 2) { - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { - for(int i{2};i < ido;i += 2) + for(size_t i{2};i < ido;i += 2) { v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]}; v4sf br{cc[i - 1 + k]}, bi{cc[i + k]}; @@ -589,7 +591,7 @@ NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc return; } const v4sf minus_one{LD_PS1(-1.0f)}; - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { ch[2*k + ido] = VMUL(minus_one, cc[ido-1 + k + l1ido]); ch[2*k + ido-1] = cc[k + ido-1]; @@ -597,11 +599,11 @@ NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc } /* radf2 */ -NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, const float *wa1) { - const int l1ido{l1*ido}; - for(int k{0};k < l1ido;k += ido) + const size_t l1ido{l1*ido}; + for(size_t k{0};k < l1ido;k += ido) { v4sf a{cc[2*k]}; v4sf b{cc[2*(k+ido) - 1]}; @@ -612,9 +614,9 @@ NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *c return; if(ido != 2) { - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { - for(int i{2};i < ido;i += 2) + for(size_t i{2};i < ido;i += 2) { v4sf a{cc[i-1 + 2*k]}; v4sf b{cc[2*(k + ido) - i - 1]}; @@ -633,7 +635,7 @@ NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *c return; } const v4sf minus_two{LD_PS1(-2.0f)}; - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { v4sf a{cc[2*k + ido-1]}; v4sf b{cc[2*k + ido]}; @@ -642,12 +644,12 @@ NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *c } } /* radb2 */ -void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, +void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2) { const v4sf taur{LD_PS1(-0.5f)}; const v4sf taui{LD_PS1(0.866025403784439f)}; - for(int k{0};k < l1;++k) + for(size_t k{0};k < l1;++k) { v4sf cr2{VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])}; ch[3*k*ido] = VADD(cc[k*ido], cr2); @@ -656,11 +658,11 @@ void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI } if(ido == 1) return; - for(int k{0};k < l1;++k) + for(size_t k{0};k < l1;++k) { - for(int i{2};i < ido;i += 2) + for(size_t i{2};i < ido;i += 2) { - const int ic{ido - i}; + const size_t ic{ido - i}; v4sf wr1{LD_PS1(wa1[i - 2])}; v4sf wi1{LD_PS1(wa1[i - 1])}; v4sf dr2{cc[i - 1 + (k + l1)*ido]}; @@ -690,8 +692,8 @@ void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI } /* radf3 */ -void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, - const float *wa2) +void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2) { static constexpr float taur{-0.5f}; static constexpr float taui{0.866025403784439f}; @@ -699,7 +701,7 @@ void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const const 
v4sf vtaur{LD_PS1(taur)}; const v4sf vtaui_2{LD_PS1(taui_2)}; - for(int k{0};k < l1;++k) + for(size_t k{0};k < l1;++k) { v4sf tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2); @@ -712,11 +714,11 @@ void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const if(ido == 1) return; const v4sf vtaui{LD_PS1(taui)}; - for(int k{0};k < l1;++k) + for(size_t k{0};k < l1;++k) { - for(int i{2};i < ido;i += 2) + for(size_t i{2};i < ido;i += 2) { - const int ic{ido - i}; + const size_t ic{ido - i}; v4sf tr2{VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])}; v4sf cr2{VMADD(vtaur, tr2, cc[i - 1 + 3*k*ido])}; ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2); @@ -739,11 +741,11 @@ void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const } } /* radb3 */ -NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc, +NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) { - const int l1ido{l1*ido}; + const size_t l1ido{l1*ido}; { const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido}; v4sf *RESTRICT ch_{ch}; @@ -767,12 +769,12 @@ NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc return; if(ido != 2) { - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { const v4sf *RESTRICT pc{cc + 1 + k}; - for(int i{2};i < ido;i += 2, pc += 2) + for(size_t i{2};i < ido;i += 2, pc += 2) { - const int ic{ido - i}; + const size_t ic{ido - i}; v4sf cr2{pc[1*l1ido+0]}; v4sf ci2{pc[1*l1ido+1]}; @@ -816,7 +818,7 @@ NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc return; } const v4sf minus_hsqt2{LD_PS1(al::numbers::sqrt2_v * -0.5f)}; - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]}; v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]}; @@ -830,12 +832,12 @@ NEVER_INLINE(void) radf4_ps(const int ido, const int l1, const v4sf *RESTRICT cc } /* radf4 */ -NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT cc, +NEVER_INLINE(void) radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) { const v4sf two{LD_PS1(2.0f)}; - const int l1ido{l1*ido}; + const size_t l1ido{l1*ido}; { const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido}; v4sf *ch_{ch}; @@ -860,11 +862,11 @@ NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT c return; if(ido != 2) { - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { const v4sf *RESTRICT pc{cc - 1 + 4*k}; v4sf *RESTRICT ph{ch + k + 1}; - for(int i{2};i < ido;i += 2) + for(size_t i{2};i < ido;i += 2) { v4sf tr1{VSUB(pc[i], pc[4*ido - i])}; v4sf tr2{VADD(pc[i], pc[4*ido - i])}; @@ -900,9 +902,9 @@ NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT c return; } const v4sf minus_sqrt2{LD_PS1(-1.414213562373095f)}; - for(int k{0};k < l1ido;k += ido) + for(size_t k{0};k < l1ido;k += ido) { - const int i0{4*k + ido}; + const size_t i0{4*k + ido}; v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]}; v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]}; v4sf tr1{VSUB(c,d)}; @@ -916,7 +918,7 @@ NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT c } } /* radb4 */ -void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT 
cc, v4sf *RESTRICT ch, +void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -928,13 +930,13 @@ void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] /* Parameter adjustments */ - const int ch_offset{1 + ido * 6}; + const size_t ch_offset{1 + ido * 6}; ch -= ch_offset; - const int cc_offset{1 + ido * (1 + l1)}; + const size_t cc_offset{1 + ido * (1 + l1)}; cc -= cc_offset; /* Function Body */ - for(int k{1};k <= l1;++k) + for(size_t k{1};k <= l1;++k) { v4sf cr2{VADD(cc_ref(1, k, 5), cc_ref(1, k, 2))}; v4sf ci5{VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2))}; @@ -950,12 +952,12 @@ void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI if(ido == 1) return; - const int idp2{ido + 2}; - for(int k{1};k <= l1;++k) + const size_t idp2{ido + 2}; + for(size_t k{1};k <= l1;++k) { - for(int i{3};i <= ido;i += 2) + for(size_t i{3};i <= ido;i += 2) { - const int ic{idp2 - i}; + const size_t ic{idp2 - i}; v4sf dr2{LD_PS1(wa1[i-3])}; v4sf di2{LD_PS1(wa1[i-2])}; v4sf dr3{LD_PS1(wa2[i-3])}; @@ -1000,7 +1002,7 @@ void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI #undef ch_ref } /* radf5 */ -void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, +void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -1012,13 +1014,13 @@ void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] /* Parameter adjustments */ - const int ch_offset{1 + ido*(1 + l1)}; + const size_t ch_offset{1 + ido*(1 + l1)}; ch -= ch_offset; - const int cc_offset{1 + ido*6}; + const size_t cc_offset{1 + ido*6}; cc -= cc_offset; /* Function Body */ - for(int k{1};k <= l1;++k) + for(size_t k{1};k <= l1;++k) { v4sf ti5{VADD(cc_ref(1, 3, k), cc_ref(1, 3, k))}; v4sf ti4{VADD(cc_ref(1, 5, k), cc_ref(1, 5, k))}; @@ -1037,12 +1039,12 @@ void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI if(ido == 1) return; - const int idp2{ido + 2}; - for(int k{1};k <= l1;++k) + const size_t idp2{ido + 2}; + for(size_t k{1};k <= l1;++k) { - for(int i{3};i <= ido;i += 2) + for(size_t i{3};i <= ido;i += 2) { - const int ic{idp2 - i}; + const size_t ic{idp2 - i}; v4sf ti5{VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; v4sf ti2{VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; v4sf ti4{VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; @@ -1084,43 +1086,43 @@ void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRI #undef ch_ref } /* radb5 */ -NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, - const float *wa, const int *ifac) +NEVER_INLINE(v4sf *) rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, + v4sf *work2, const float *wa, const al::span ifac) { assert(work1 != work2); const v4sf *in{input_readonly}; v4sf *out{in == work2 ? 
work1 : work2}; - const int nf{ifac[1]}; - int l2{n}; - int iw{n-1}; - for(int k1{1};k1 <= nf;++k1) + const size_t nf{ifac[1]}; + size_t l2{n}; + size_t iw{n-1}; + for(size_t k1{1};k1 <= nf;++k1) { - int kh{nf - k1}; - int ip{ifac[kh + 2]}; - int l1{l2 / ip}; - int ido{n / l2}; + size_t kh{nf - k1}; + size_t ip{ifac[kh + 2]}; + size_t l1{l2 / ip}; + size_t ido{n / l2}; iw -= (ip - 1)*ido; switch(ip) { case 5: { - int ix2{iw + ido}; - int ix3{ix2 + ido}; - int ix4{ix3 + ido}; + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; + size_t ix4{ix3 + ido}; radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); } break; case 4: { - int ix2{iw + ido}; - int ix3{ix2 + ido}; + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); } break; case 3: { - int ix2{iw + ido}; + size_t ix2{iw + ido}; radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); } break; @@ -1146,41 +1148,41 @@ NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *wo return const_cast(in); /* this is in fact the output .. */ } /* rfftf1 */ -NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, - const float *wa, const int *ifac) +NEVER_INLINE(v4sf *) rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, + v4sf *work2, const float *wa, const al::span ifac) { assert(work1 != work2); const v4sf *in{input_readonly}; v4sf *out{in == work2 ? work1 : work2}; - const int nf{ifac[1]}; - int l1{1}; - int iw{0}; - for(int k1{1};k1 <= nf;++k1) + const size_t nf{ifac[1]}; + size_t l1{1}; + size_t iw{0}; + for(size_t k1{1};k1 <= nf;++k1) { - int ip{ifac[k1 + 1]}; - int l2{ip*l1}; - int ido{n / l2}; + size_t ip{ifac[k1 + 1]}; + size_t l2{ip*l1}; + size_t ido{n / l2}; switch(ip) { case 5: { - int ix2{iw + ido}; - int ix3{ix2 + ido}; - int ix4{ix3 + ido}; + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; + size_t ix4{ix3 + ido}; radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); } break; case 4: { - int ix2{iw + ido}; - int ix3{ix2 + ido}; + size_t ix2{iw + ido}; + size_t ix3{ix2 + ido}; radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); } break; case 3: { - int ix2{iw + ido}; + size_t ix2{iw + ido}; radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); } break; @@ -1208,41 +1210,41 @@ NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *wo return const_cast(in); /* this is in fact the output .. */ } -v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, - const int *ifac, const float fsign) +v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const al::span ifac, const float fsign) { assert(work1 != work2); const v4sf *in{input_readonly}; v4sf *out{in == work2 ? 
work1 : work2}; - const int nf{ifac[1]}; - int l1{1}, iw{0}; - for(int k1{2};k1 <= nf+1;++k1) + const size_t nf{ifac[1]}; + size_t l1{1}, iw{0}; + for(size_t k1{2};k1 <= nf+1;++k1) { - const int ip{ifac[k1]}; - const int l2{ip*l1}; - const int ido{n / l2}; - const int idot{ido + ido}; + const size_t ip{ifac[k1]}; + const size_t l2{ip*l1}; + const size_t ido{n / l2}; + const size_t idot{ido + ido}; switch(ip) { case 5: { - int ix2{iw + idot}; - int ix3{ix2 + idot}; - int ix4{ix3 + idot}; + size_t ix2{iw + idot}; + size_t ix3{ix2 + idot}; + size_t ix4{ix3 + idot}; passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); } break; case 4: { - int ix2{iw + idot}; - int ix3{ix2 + idot}; + size_t ix2{iw + idot}; + size_t ix3{ix2 + idot}; passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); } break; case 3: { - int ix2{iw + idot}; + size_t ix2{iw + idot}; passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); } break; @@ -1270,24 +1272,24 @@ v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work } -int decompose(const int n, int *ifac, const al::span ntryh) +uint decompose(const uint n, const al::span ifac, const al::span ntryh) { - int nl{n}, nf{0}; - for(const int ntry : ntryh) + uint nl{n}, nf{0}; + for(const uint ntry : ntryh) { while(nl != 1) { - const int nq{nl / ntry}; - const int nr{nl % ntry}; + const uint nq{nl / ntry}; + const uint nr{nl % ntry}; if(nr != 0) break; ifac[2+nf++] = ntry; nl = nq; if(ntry == 2 && nf != 1) { - for(int i{2};i <= nf;++i) + for(size_t i{2};i <= nf;++i) { - int ib{nf - i + 2}; + size_t ib{nf - i + 2}; ifac[ib + 1] = ifac[ib]; } ifac[2] = 2; @@ -1299,33 +1301,32 @@ int decompose(const int n, int *ifac, const al::span ntryh) return nf; } -void rffti1_ps(const int n, float *wa, int *ifac) +void rffti1_ps(const uint n, float *wa, const al::span ifac) { - static constexpr int ntryh[]{4,2,3,5}; + static constexpr uint ntryh[]{4,2,3,5}; - const int nf{decompose(n, ifac, ntryh)}; + const uint nf{decompose(n, ifac, ntryh)}; const double argh{2.0*al::numbers::pi / n}; - int is{0}; - int nfm1{nf - 1}; - int l1{1}; - for(int k1{1};k1 <= nfm1;++k1) + size_t is{0}; + size_t nfm1{nf - 1}; + size_t l1{1}; + for(size_t k1{1};k1 <= nfm1;++k1) { - const int ip{ifac[k1 + 1]}; - const int l2{l1*ip}; - const int ido{n / l2}; - const int ipm{ip - 1}; + const size_t ip{ifac[k1 + 1]}; + const size_t l2{l1*ip}; + const size_t ido{n / l2}; + const size_t ipm{ip - 1}; int ld{0}; - for(int j{1};j <= ipm;++j) + for(size_t j{1};j <= ipm;++j) { - int i{is}, fi{0}; + size_t i{is}, fi{0}; ld += l1; double argld{ld*argh}; - for(int ii{3};ii <= ido;ii += 2) + for(size_t ii{3};ii <= ido;ii += 2) { - i += 2; fi += 1; - wa[i - 2] = static_cast(std::cos(fi*argld)); - wa[i - 1] = static_cast(std::sin(fi*argld)); + wa[i++] = static_cast(std::cos(static_cast(fi)*argld)); + wa[i++] = static_cast(std::sin(static_cast(fi)*argld)); } is += ido; } @@ -1333,35 +1334,34 @@ void rffti1_ps(const int n, float *wa, int *ifac) } } /* rffti1 */ -void cffti1_ps(const int n, float *wa, int *ifac) +void cffti1_ps(const uint n, float *wa, const al::span ifac) { - static constexpr int ntryh[]{5,3,4,2}; + static constexpr uint ntryh[]{5,3,4,2}; - const int nf{decompose(n, ifac, ntryh)}; + const uint nf{decompose(n, ifac, ntryh)}; const double argh{2.0*al::numbers::pi / n}; - int i{1}; - int l1{1}; - for(int k1{1};k1 <= nf;++k1) + size_t i{1}; + size_t l1{1}; + for(size_t k1{1};k1 <= nf;++k1) { - const int ip{ifac[k1+1]}; - const int l2{l1*ip}; - const int ido{n / 
l2}; - const int idot{ido + ido + 2}; - const int ipm{ip - 1}; - int ld{0}; - for(int j{1};j <= ipm;++j) + const size_t ip{ifac[k1+1]}; + const size_t l2{l1*ip}; + const size_t ido{n / l2}; + const size_t idot{ido + ido + 2}; + const size_t ipm{ip - 1}; + size_t ld{0}; + for(size_t j{1};j <= ipm;++j) { - int i1{i}, fi{0}; + size_t i1{i}, fi{0}; wa[i-1] = 1; wa[i] = 0; ld += l1; const double argld{ld*argh}; - for(int ii{4};ii <= idot;ii += 2) + for(size_t ii{4};ii <= idot;ii += 2) { - i += 2; fi += 1; - wa[i-1] = static_cast(std::cos(fi*argld)); - wa[i] = static_cast(std::sin(fi*argld)); + wa[++i] = static_cast(std::cos(static_cast(fi)*argld)); + wa[++i] = static_cast(std::sin(static_cast(fi)*argld)); } if(ip > 5) { @@ -1383,16 +1383,16 @@ void pffft_aligned_free(void *p) { al_free(p); } int pffft_simd_size() { return SIMD_SZ; } struct PFFFT_Setup { - int N; - int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) - int ifac[15]; + uint N; + uint Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + std::array ifac; pffft_transform_t transform; float *twiddle; // N/4 elements alignas(MALLOC_V4SF_ALIGNMENT) v4sf e[1]; // N/4*3 elements }; -PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) +PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform) { assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX); assert(N > 0); @@ -1405,7 +1405,7 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) else assert((N%(SIMD_SZ*SIMD_SZ)) == 0); - const auto Ncvec = static_cast(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; + const uint Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; const size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; @@ -1415,19 +1415,19 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) s->N = N; s->transform = transform; /* nb of complex simd vectors */ - s->Ncvec = static_cast(Ncvec); + s->Ncvec = Ncvec; s->twiddle = reinterpret_cast(&s->e[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]); if constexpr(SIMD_SZ > 1) { al::vector e(2u*Ncvec*(SIMD_SZ-1)); - for(int k{0};k < s->Ncvec;++k) + for(size_t k{0};k < s->Ncvec;++k) { - const size_t i{static_cast(k) / SIMD_SZ}; - const size_t j{static_cast(k) % SIMD_SZ}; + const size_t i{k / SIMD_SZ}; + const size_t j{k % SIMD_SZ}; for(size_t m{0};m < SIMD_SZ-1;++m) { - const double A{-2.0*al::numbers::pi*static_cast(m+1)*k / N}; + const double A{-2.0*al::numbers::pi*static_cast((m+1)*k) / N}; e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); } @@ -1440,8 +1440,8 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); /* check that N is decomposable with allowed prime factors */ - int m{1}; - for(int k{0};k < s->ifac[1];++k) + size_t m{1}; + for(size_t k{0};k < s->ifac[1];++k) m *= s->ifac[2+k]; if(m != N/SIMD_SZ) @@ -1465,14 +1465,14 @@ void pffft_destroy_setup(PFFFT_Setup *s) namespace { /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out) +void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *out) { v4sf g0, g1; INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] - for(int k{1};k < N;++k) + for(size_t k{1};k < N;++k) { v4sf h0, h1; INTERLEAVE2(in[0], in[1], h0, h1); @@ 
-1484,11 +1484,11 @@ void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out) *--out = VSWAPHL(g1, g0); } -void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_stride) +void unreversed_copy(const size_t N, const v4sf *in, v4sf *out, const int out_stride) { v4sf g0{in[0]}, g1{g0}; ++in; - for(int k{1};k < N;++k) + for(size_t k{1};k < N;++k) { v4sf h0{*in++}; v4sf h1{*in++}; g1 = VSWAPHL(g1, h0); @@ -1503,12 +1503,12 @@ void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_strid UNINTERLEAVE2(h0, g1, out[0], out[1]); } -void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { assert(in != out); - const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks - for(int k{0};k < dk;++k) + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + for(size_t k{0};k < dk;++k) { v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; @@ -1547,12 +1547,12 @@ void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf } } -void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { assert(in != out); - const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks - for(int k{0};k < dk;++k) + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + for(size_t k{0};k < dk;++k) { v4sf r0{in[8*k+0]}, i0{in[8*k+1]}; v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; @@ -1638,12 +1638,12 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co *out++ = i3; } -NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +NEVER_INLINE(void) pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { static constexpr float s{al::numbers::sqrt2_v/2.0f}; assert(in != out); - const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ const v4sf zero{VZERO()}; @@ -1672,7 +1672,7 @@ NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *ou const float xr3{ ci[0] - s*(ci[1]-ci[3])}; out[6] = VINSERT0(out[6], xr3); const float xi3{ ci[2] - s*(ci[1]+ci[3])}; out[7] = VINSERT0(out[7], xi3); - for(int k{1};k < dk;++k) + for(size_t k{1};k < dk;++k) pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); } @@ -1728,16 +1728,16 @@ ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4s *out++ = i3; } -NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +NEVER_INLINE(void) pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { static constexpr float sqrt2{al::numbers::sqrt2_v}; assert(in != out); - const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks + const size_t dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ std::array Xr, Xi; - for(size_t k{0};k < 4;++k) + for(size_t k{0};k < SIMD_SZ;++k) { Xr[k] = VEXTRACT0(in[4*k]); Xi[k] = VEXTRACT0(in[4*k + 1]); @@ -1756,7 +1756,7 @@ NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf * * [ci2] [0 0 0 0 0 -2 0 2] * [ci3] [0 -s 0 s 0 -s 0 -s] */ - for(int k{1};k < dk;++k) + for(size_t k{1};k < dk;++k) pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false); const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; @@ -1778,7 +1778,7 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout assert(scratch != nullptr); assert(voutput != scratch); - const int Ncvec{setup->Ncvec}; + const size_t Ncvec{setup->Ncvec}; const bool nf_odd{(setup->ifac[1]&1) != 0}; v4sf *buff[2]{voutput, scratch}; @@ -1797,7 +1797,7 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout else { v4sf *tmp{buff[ib]}; - for(int k=0; k < Ncvec; ++k) + for(size_t k=0; k < Ncvec; ++k) UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]); @@ -1830,7 +1830,7 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout { pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e); ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]); - for(int k{0};k < Ncvec;++k) + for(size_t k{0};k < Ncvec;++k) INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); } } @@ -1839,7 +1839,7 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout { /* extra copy required -- this situation should only happen when finput == foutput */ assert(vinput==voutput); - for(int k{0};k < Ncvec;++k) + for(size_t k{0};k < Ncvec;++k) { v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; voutput[2*k] = a; voutput[2*k+1] = b; @@ -1853,15 +1853,15 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc { assert(in != out); - const int N{setup->N}, Ncvec{setup->Ncvec}; + const size_t N{setup->N}, Ncvec{setup->Ncvec}; const v4sf *vin{reinterpret_cast(in)}; v4sf *vout{reinterpret_cast(out)}; if(setup->transform == PFFFT_REAL) { - const int dk{N/32}; + const size_t dk{N/32}; if(direction == PFFFT_FORWARD) { - for(int k{0};k < dk;++k) + for(size_t k{0};k < dk;++k) { INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); @@ -1871,7 +1871,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc } else { - for(int k{0};k < dk;++k) + for(size_t k{0};k < dk;++k) { UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); @@ -1884,17 +1884,17 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc { if(direction == PFFFT_FORWARD) { - for(int k{0};k < Ncvec;++k) + for(size_t k{0};k < Ncvec;++k) { - int kk{(k/4) + (k%4)*(Ncvec/4)}; + size_t kk{(k/4) + (k%4)*(Ncvec/4)}; INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); } } else { - for(int k{0};k < Ncvec;++k) + for(size_t k{0};k < Ncvec;++k) { - int kk{(k/4) + (k%4)*(Ncvec/4)}; + size_t kk{(k/4) + (k%4)*(Ncvec/4)}; UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); } } @@ -1904,7 +1904,7 @@ void pffft_zreorder(PFFFT_Setup *setup, 
const float *in, float *out, pffft_direc void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - const int Ncvec{s->Ncvec}; + const size_t Ncvec{s->Ncvec}; const v4sf *RESTRICT va{reinterpret_cast(a)}; const v4sf *RESTRICT vb{reinterpret_cast(b)}; v4sf *RESTRICT vab{reinterpret_cast(ab)}; @@ -1942,7 +1942,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, * optimizers? */ const float *a_{a}, *b_{b}; float *ab_{ab}; - int N{Ncvec}; + size_t N{Ncvec}; asm volatile("mov r8, %2 \n" "vdup.f32 q15, %4 \n" "1: \n" @@ -1981,7 +1981,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, /* Default routine, works fine for non-arm cpus with current compilers. */ const v4sf vscal{LD_PS1(scaling)}; - for(int i{0};i < Ncvec;i += 2) + for(size_t i{0};i < Ncvec;i += 2) { v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; @@ -2030,7 +2030,7 @@ namespace { void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, const pffft_direction_t direction, bool ordered) { - const int Ncvec{setup->Ncvec}; + const size_t Ncvec{setup->Ncvec}; const bool nf_odd{(setup->ifac[1]&1) != 0}; assert(scratch != nullptr); @@ -2073,7 +2073,7 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo { // extra copy required -- this situation should happens only when finput == foutput assert(input==output); - for(int k{0};k < Ncvec;++k) + for(size_t k{0};k < Ncvec;++k) { float a{buff[ib][2*k]}, b{buff[ib][2*k+1]}; output[2*k] = a; output[2*k+1] = b; @@ -2087,17 +2087,17 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { - const int N{setup->N}; + const size_t N{setup->N}; if(setup->transform == PFFFT_COMPLEX) { - for(int k{0};k < 2*N;++k) + for(size_t k{0};k < 2*N;++k) out[k] = in[k]; return; } else if(direction == PFFFT_FORWARD) { float x_N{in[N-1]}; - for(int k{N-1};k > 1;--k) + for(size_t k{N-1};k > 1;--k) out[k] = in[k-1]; out[0] = in[0]; out[1] = x_N; @@ -2105,7 +2105,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, else { float x_N{in[1]}; - for(int k{1};k < N-1;++k) + for(size_t k{1};k < N-1;++k) out[k] = in[k+1]; out[0] = in[0]; out[N-1] = x_N; @@ -2116,7 +2116,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - int Ncvec = s->Ncvec; + size_t Ncvec{s->Ncvec}; if(s->transform == PFFFT_REAL) { @@ -2125,7 +2125,7 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; ++ab; ++a; ++b; --Ncvec; } - for(int i{0};i < Ncvec;++i) + for(size_t i{0};i < Ncvec;++i) { float ar{a[2*i+0]}, ai{a[2*i+1]}; const float br{b[2*i+0]}, bi{b[2*i+1]}; diff --git a/common/pffft.h b/common/pffft.h index 87d10216..3b402ca4 100644 --- a/common/pffft.h +++ b/common/pffft.h @@ -107,7 +107,7 @@ typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; * structure is read-only so it can safely be shared by multiple concurrent * threads. 
*/ -PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); +PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform); void pffft_destroy_setup(PFFFT_Setup *setup); /** @@ -170,7 +170,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const fl * buffers. */ void *pffft_aligned_malloc(size_t nb_bytes); -void pffft_aligned_free(void *); +void pffft_aligned_free(void *ptr); /* Return 4 or 1 depending if vectorization was enable when building pffft.cpp. */ int pffft_simd_size(); -- cgit v1.2.3 From 9274190020659369ef0734d6314d9f89cce889b0 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 10 Oct 2023 07:01:51 -0700 Subject: Fix array lookup index --- common/pffft.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 1ec7514a..c01c8988 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -1421,6 +1421,7 @@ PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform) if constexpr(SIMD_SZ > 1) { al::vector e(2u*Ncvec*(SIMD_SZ-1)); + std::fill(e.begin(), e.end(), 0.0f); for(size_t k{0};k < s->Ncvec;++k) { const size_t i{k / SIMD_SZ}; @@ -1739,8 +1740,8 @@ NEVER_INLINE(void) pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4s std::array Xr, Xi; for(size_t k{0};k < SIMD_SZ;++k) { - Xr[k] = VEXTRACT0(in[4*k]); - Xi[k] = VEXTRACT0(in[4*k + 1]); + Xr[k] = VEXTRACT0(in[2*k]); + Xi[k] = VEXTRACT0(in[2*k + 1]); } pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values -- cgit v1.2.3 From ce25165944913c12b9b782e40691f3be1d18dadd Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 10 Oct 2023 16:33:09 -0700 Subject: Fix conversion warnings --- common/pffft.cpp | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index c01c8988..7e5ba5c3 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -1310,23 +1310,24 @@ void rffti1_ps(const uint n, float *wa, const al::span ifac) size_t is{0}; size_t nfm1{nf - 1}; size_t l1{1}; - for(size_t k1{1};k1 <= nfm1;++k1) + for(size_t k1{0};k1 < nfm1;++k1) { - const size_t ip{ifac[k1 + 1]}; + const size_t ip{ifac[k1+2]}; const size_t l2{l1*ip}; const size_t ido{n / l2}; const size_t ipm{ip - 1}; - int ld{0}; - for(size_t j{1};j <= ipm;++j) + size_t ld{0}; + for(size_t j{0};j < ipm;++j) { - size_t i{is}, fi{0}; + size_t i{is}; ld += l1; - double argld{ld*argh}; - for(size_t ii{3};ii <= ido;ii += 2) + const double argld{static_cast(ld)*argh}; + double fi{0.0}; + for(size_t ii{2};ii < ido;ii += 2) { - fi += 1; - wa[i++] = static_cast(std::cos(static_cast(fi)*argld)); - wa[i++] = static_cast(std::sin(static_cast(fi)*argld)); + fi += 1.0; + wa[i++] = static_cast(std::cos(fi*argld)); + wa[i++] = static_cast(std::sin(fi*argld)); } is += ido; } @@ -1342,26 +1343,27 @@ void cffti1_ps(const uint n, float *wa, const al::span ifac) const double argh{2.0*al::numbers::pi / n}; size_t i{1}; size_t l1{1}; - for(size_t k1{1};k1 <= nf;++k1) + for(size_t k1{0};k1 < nf;++k1) { - const size_t ip{ifac[k1+1]}; + const size_t ip{ifac[k1+2]}; const size_t l2{l1*ip}; const size_t ido{n / l2}; const size_t idot{ido + ido + 2}; const size_t ipm{ip - 1}; size_t ld{0}; - for(size_t j{1};j <= ipm;++j) + for(size_t j{0};j < ipm;++j) { - size_t i1{i}, fi{0}; + size_t i1{i}; wa[i-1] = 1; wa[i] = 0; ld += l1; - const double argld{ld*argh}; - for(size_t ii{4};ii <= idot;ii += 2) + const 
double argld{static_cast(ld)*argh}; + double fi{0.0}; + for(size_t ii{3};ii < idot;ii += 2) { - fi += 1; - wa[++i] = static_cast(std::cos(static_cast(fi)*argld)); - wa[++i] = static_cast(std::sin(static_cast(fi)*argld)); + fi += 1.0; + wa[++i] = static_cast(std::cos(fi*argld)); + wa[++i] = static_cast(std::sin(fi*argld)); } if(ip > 5) { -- cgit v1.2.3 From 5149cb8c357630dba5253e2568b68d2ed069bcea Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Wed, 11 Oct 2023 15:56:11 -0700 Subject: Make and use a separate zconvolve method without scaling When you're doing hundreds or thousands of separate zconvolve calls into the same buffer, it's more efficient to do the multiply once at the end instead of in each call. --- alc/effects/convolution.cpp | 15 ++++----- common/pffft.cpp | 79 +++++++++++++++++++++++++++++++++++++++++++-- common/pffft.h | 16 +++++++-- 3 files changed, 96 insertions(+), 14 deletions(-) (limited to 'common/pffft.cpp') diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp index c7a342dc..a98a0616 100644 --- a/alc/effects/convolution.cpp +++ b/alc/effects/convolution.cpp @@ -655,11 +655,6 @@ void ConvolutionState::process(const size_t samplesToDo, const float *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*ConvolveUpdateSize}; for(size_t c{0};c < chans.size();++c) { - /* The iFFT'd response is scaled up by the number of bins, so apply - * the inverse to normalize the output. - */ - static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}}; - /* Convolve each input segment with its IR filter counterpart * (aligned in time). */ @@ -667,14 +662,14 @@ void ConvolutionState::process(const size_t samplesToDo, const float *RESTRICT input{&mComplexData[curseg*ConvolveUpdateSize]}; for(size_t s{curseg};s < mNumConvolveSegs;++s) { - pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale); + pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data()); input += ConvolveUpdateSize; filter += ConvolveUpdateSize; } input = mComplexData.get(); for(size_t s{0};s < curseg;++s) { - pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale); + pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data()); input += ConvolveUpdateSize; filter += ConvolveUpdateSize; } @@ -687,8 +682,12 @@ void ConvolutionState::process(const size_t samplesToDo, pffft_transform(mFft.get(), mFftBuffer.data(), mFftBuffer.data(), mFftWorkBuffer.data(), PFFFT_BACKWARD); + /* The iFFT'd response is scaled up by the number of bins, so apply + * the inverse to normalize the output. 
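+ * Doing the normalization once here, per output block, replaces the
+ * per-call scaling that was previously passed to each zconvolve call
+ * above, now that those calls no longer take a scaling factor.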
+ */ + static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}}; for(size_t i{0};i < ConvolveUpdateSamples;++i) - mOutput[c][i] = mFftBuffer[i] + mOutput[c][ConvolveUpdateSamples+i]; + mOutput[c][i] = (mFftBuffer[i]+mOutput[c][ConvolveUpdateSamples+i]) * fftscale; for(size_t i{0};i < ConvolveUpdateSamples;++i) mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i]; } diff --git a/common/pffft.cpp b/common/pffft.cpp index 7e5ba5c3..f8568acf 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -1904,7 +1904,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc } } -void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, +void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { const size_t Ncvec{s->Ncvec}; @@ -2006,6 +2006,59 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, } } +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab) +{ + const size_t Ncvec{s->Ncvec}; + const v4sf *RESTRICT va{reinterpret_cast(a)}; + const v4sf *RESTRICT vb{reinterpret_cast(b)}; + v4sf *RESTRICT vab{reinterpret_cast(ab)}; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +#endif + + const float ar1{VEXTRACT0(va[0])}; + const float ai1{VEXTRACT0(va[1])}; + const float br1{VEXTRACT0(vb[0])}; + const float bi1{VEXTRACT0(vb[1])}; + const float abr1{VEXTRACT0(vab[0])}; + const float abi1{VEXTRACT0(vab[1])}; + + /* No inline assembly for this version. I'm not familiar enough with NEON + * assembly, and I don't know that it's needed with today's optimizers. 
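+ * The plain loop below complex-multiplies the coefficients of a and b
+ * and adds the products into ab (dft_ab += dft_a * dft_b), two SIMD
+ * vector pairs per iteration.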
+ */ + for(size_t i{0};i < Ncvec;i += 2) + { + v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; + v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; + VCPLXMUL(ar4, ai4, br4, bi4); + vab[2*i+0] = VADD(ar4, vab[2*i+0]); + vab[2*i+1] = VADD(ai4, vab[2*i+1]); + ar4 = va[2*i+2]; ai4 = va[2*i+3]; + br4 = vb[2*i+2]; bi4 = vb[2*i+3]; + VCPLXMUL(ar4, ai4, br4, bi4); + vab[2*i+2] = VADD(ar4, vab[2*i+2]); + vab[2*i+3] = VADD(ai4, vab[2*i+3]); + } + + if(s->transform == PFFFT_REAL) + { + vab[0] = VINSERT0(vab[0], abr1 + ar1*br1); + vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1); + } +} + void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { @@ -2115,8 +2168,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, } } -#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate -void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab, +void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { size_t Ncvec{s->Ncvec}; @@ -2138,6 +2190,27 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo } } +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab) +{ + size_t Ncvec{s->Ncvec}; + + if(s->transform == PFFFT_REAL) + { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]; + ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]; + ++ab; ++a; ++b; --Ncvec; + } + for(size_t i{0};i < Ncvec;++i) + { + float ar{a[2*i+0]}, ai{a[2*i+1]}; + const float br{b[2*i+0]}, bi{b[2*i+1]}; + VCPLXMUL(ar, ai, br, bi); + ab[2*i+0] += ar; + ab[2*i+1] += ai; + } +} + void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { diff --git a/common/pffft.h b/common/pffft.h index 3b402ca4..4bc3ebb6 100644 --- a/common/pffft.h +++ b/common/pffft.h @@ -152,8 +152,8 @@ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *outp void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); /** - * Perform a multiplication of the z-domain data in dft_a and dft_b and - * accumulate them into dft_ab. The arrays should have been obtained with + * Perform a multiplication of the z-domain data in dft_a and dft_b, and scale + * and accumulate into dft_ab. The arrays should have been obtained with * pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD) * and should *not* be in the usual order (otherwise just perform the operation * yourself as the dft coeffs are stored as interleaved complex numbers). @@ -162,7 +162,17 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft * * The dft_a, dft_b, and dft_ab parameters may alias. */ -void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); +void pffft_zconvolve_scale_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + +/** + * Perform a multiplication of the z-domain data in dft_a and dft_b, and + * accumulate into dft_ab. + * + * The operation performed is: dft_ab += dft_a * dft_b + * + * The dft_a, dft_b, and dft_ab parameters may alias. 
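+ *
+ * This is intended for accumulating many separate convolutions into the
+ * same buffer, where any overall scaling is cheaper to apply once to the
+ * final result than in every call (see pffft_zconvolve_scale_accumulate
+ * for the scaling variant).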
+ */ +void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab); /** * The float buffers must have the correct alignment (16-byte boundary on intel -- cgit v1.2.3 From a0819843be860bff54b378eebc2277069041b373 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Thu, 12 Oct 2023 01:40:39 -0700 Subject: Don't use al::vector where not needed --- common/pffft.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index f8568acf..ea93dfc8 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -70,7 +70,6 @@ #include "alnumbers.h" #include "alspan.h" #include "opthelpers.h" -#include "vector.h" namespace { @@ -1422,8 +1421,7 @@ PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform) if constexpr(SIMD_SZ > 1) { - al::vector e(2u*Ncvec*(SIMD_SZ-1)); - std::fill(e.begin(), e.end(), 0.0f); + auto e = std::vector(2u*Ncvec*(SIMD_SZ-1), 0.0f); for(size_t k{0};k < s->Ncvec;++k) { const size_t i{k / SIMD_SZ}; @@ -1431,8 +1429,8 @@ PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform) for(size_t m{0};m < SIMD_SZ-1;++m) { const double A{-2.0*al::numbers::pi*static_cast((m+1)*k) / N}; - e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); - e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); + e[((i*3 + m)*2 + 0)*SIMD_SZ + j] = static_cast(std::cos(A)); + e[((i*3 + m)*2 + 1)*SIMD_SZ + j] = static_cast(std::sin(A)); } } std::memcpy(s->e, e.data(), e.size()*sizeof(float)); -- cgit v1.2.3 From c5af96992543c76c249f1c67478733fff96fc952 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 16 Oct 2023 06:31:56 -0700 Subject: Constify some parameters --- common/pffft.cpp | 33 +++++++++++++++++++-------------- common/pffft.h | 10 +++++----- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index ea93dfc8..fd9f240a 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -1773,7 +1773,7 @@ NEVER_INLINE(void) pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4s } -void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput, +void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput, v4sf *scratch, const pffft_direction_t direction, const bool ordered) { assert(scratch != nullptr); @@ -1850,7 +1850,8 @@ void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *vout } // namespace -void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) +void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, + pffft_direction_t direction) { assert(in != out); @@ -1902,8 +1903,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc } } -void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, - float scaling) +void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { const size_t Ncvec{s->Ncvec}; const v4sf *RESTRICT va{reinterpret_cast(a)}; @@ -2004,7 +2005,7 @@ void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const floa } } -void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab) +void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab) { const size_t Ncvec{s->Ncvec}; const v4sf *RESTRICT 
va{reinterpret_cast(a)}; @@ -2058,7 +2059,8 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, } -void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, + pffft_direction_t direction) { assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work)); pffft_transform_internal(setup, reinterpret_cast(al::assume_aligned<16>(input)), @@ -2066,7 +2068,8 @@ void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, floa reinterpret_cast(al::assume_aligned<16>(work)), direction, false); } -void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, + float *work, pffft_direction_t direction) { assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work)); pffft_transform_internal(setup, reinterpret_cast(al::assume_aligned<16>(input)), @@ -2081,7 +2084,7 @@ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *outp namespace { #define pffft_transform_internal_nosimd pffft_transform_internal -void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, +void pffft_transform_internal_nosimd(const PFFFT_Setup *setup, const float *input, float *output, float *scratch, const pffft_direction_t direction, bool ordered) { const size_t Ncvec{setup->Ncvec}; @@ -2138,7 +2141,7 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo } // namespace #define pffft_zreorder_nosimd pffft_zreorder -void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, +void pffft_zreorder_nosimd(const PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { const size_t N{setup->N}; @@ -2166,8 +2169,8 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, } } -void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, - float scaling) +void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { size_t Ncvec{s->Ncvec}; @@ -2188,7 +2191,7 @@ void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const floa } } -void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab) +void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab) { size_t Ncvec{s->Ncvec}; @@ -2210,12 +2213,14 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, } -void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, + pffft_direction_t direction) { pffft_transform_internal(setup, input, output, work, direction, false); } -void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) +void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, + float *work, pffft_direction_t direction) { pffft_transform_internal(setup, input, output, work, direction, true); } diff --git a/common/pffft.h b/common/pffft.h index 4bc3ebb6..9cff9e54 100644 --- a/common/pffft.h +++ b/common/pffft.h @@ -126,7 +126,7 @@ void 
pffft_destroy_setup(PFFFT_Setup *setup); * * The input and output parameters may alias. */ -void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); +void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); /** * Similar to pffft_transform, but handles the complex values in the usual form @@ -138,7 +138,7 @@ void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, floa * * The input and output parameters may alias. */ -void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); +void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); /** * Reorder the z-domain data. For PFFFT_FORWARD, it reorders from the internal @@ -149,7 +149,7 @@ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *outp * * The input and output parameters should not alias. */ -void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); +void pffft_zreorder(const PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); /** * Perform a multiplication of the z-domain data in dft_a and dft_b, and scale @@ -162,7 +162,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft * * The dft_a, dft_b, and dft_ab parameters may alias. */ -void pffft_zconvolve_scale_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); +void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); /** * Perform a multiplication of the z-domain data in dft_a and dft_b, and @@ -172,7 +172,7 @@ void pffft_zconvolve_scale_accumulate(PFFFT_Setup *setup, const float *dft_a, co * * The dft_a, dft_b, and dft_ab parameters may alias. */ -void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab); +void pffft_zconvolve_accumulate(const PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab); /** * The float buffers must have the correct alignment (16-byte boundary on intel -- cgit v1.2.3 From 50fce82c4043d989a6868b13a6930fa31b0cc420 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 16 Oct 2023 07:12:30 -0700 Subject: Avoid some macros that use inputs multiple times --- common/pffft.cpp | 82 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 25 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index fd9f240a..80d4e9c7 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -112,10 +112,10 @@ typedef vector float v4sf; #define SIMD_SZ 4 #define VZERO() ((vector float) vec_splat_u8(0)) #define VMUL(a,b) vec_madd(a,b, VZERO()) -#define VADD(a,b) vec_add(a,b) -#define VMADD(a,b,c) vec_madd(a,b,c) -#define VSUB(a,b) vec_sub(a,b) -#define LD_PS1(p) vec_splats(p) +#define VADD vec_add +#define VMADD vec_madd +#define VSUB vec_sub +#define LD_PS1 vec_splats inline v4sf vset4(float a, float b, float c, float d) { /* There a more efficient way to do this? 
*/ @@ -125,12 +125,20 @@ inline v4sf vset4(float a, float b, float c, float d) #define VSET4 vset4 #define VINSERT0(v, a) vec_insert((a), (v), 0) #define VEXTRACT0(v) vec_extract((v), 0) -#define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } while(0) -#define UNINTERLEAVE2(in1, in2, out1, out2) do { \ - vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ - vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ - v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ -} while(0) +ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{vec_mergeh(in1, in2)}; + out2 = vec_mergel(in1, in2); + out1 = tmp; +} +#define INTERLEAVE2 interleave2 +ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{vec_perm(in1, in2, (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27))}; + out2 = vec_perm(in1, in2, (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31)); + out1 = tmp; +} +#define UNINTERLEAVE2 uninterleave2 #define VTRANSPOSE4(x0,x1,x2,x3) do { \ v4sf y0 = vec_mergeh(x0, x2); \ v4sf y1 = vec_mergel(x0, x2); \ @@ -162,8 +170,20 @@ typedef __m128 v4sf; #define VSET4 _mm_setr_ps #define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) #define VEXTRACT0 _mm_cvtss_f32 -#define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0) -#define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0) +ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{_mm_unpacklo_ps(in1, in2)}; + out2 = _mm_unpackhi_ps(in1, in2); + out1 = tmp; +} +#define INTERLEAVE2 interleave2 +ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp{_mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0))}; + out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); + out1 = tmp; +} +#define UNINTERLEAVE2 uninterleave2 #define VTRANSPOSE4 _MM_TRANSPOSE4_PS #define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) #define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) @@ -182,7 +202,7 @@ typedef float32x4_t v4sf; #define VMADD(a,b,c) vmlaq_f32(c,a,b) #define VSUB vsubq_f32 #define LD_PS1 vdupq_n_f32 -inline v4sf vset4(float a, float b, float c, float d) +ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept { float32x4_t ret{vmovq_n_f32(a)}; ret = vsetq_lane_f32(b, ret, 1); @@ -220,20 +240,20 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; #define VMADD(a,b,c) ((a)*(b) + (c)) #define VSUB(a,b) ((a) - (b)) -constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } +constexpr ALWAYS_INLINE(v4sf) ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 #define VSET4(a, b, c, d) v4sf{(a), (b), (c), (d)} -constexpr v4sf vinsert0(v4sf v, float a) noexcept +constexpr ALWAYS_INLINE(v4sf) vinsert0(v4sf v, float a) noexcept { return v4sf{a, v[1], v[2], v[3]}; } #define VINSERT0 vinsert0 #define VEXTRACT0(v) ((v)[0]) -[[gnu::always_inline]] inline v4sf unpacklo(v4sf a, v4sf b) noexcept +ALWAYS_INLINE(v4sf) unpacklo(v4sf a, v4sf b) noexcept { return v4sf{a[0], b[0], a[1], b[1]}; } 
-[[gnu::always_inline]] inline v4sf unpackhi(v4sf a, v4sf b) noexcept +ALWAYS_INLINE(v4sf) unpackhi(v4sf a, v4sf b) noexcept { return v4sf{a[2], b[2], a[3], b[3]}; } -[[gnu::always_inline]] inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp__{unpacklo(in1, in2)}; out2 = unpackhi(in1, in2); @@ -241,7 +261,7 @@ constexpr v4sf vinsert0(v4sf v, float a) noexcept } #define INTERLEAVE2 interleave2 -[[gnu::always_inline]] inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp__{in1[0], in1[2], in2[0], in2[2]}; out2 = v4sf{in1[1], in1[3], in2[1], in2[3]}; @@ -249,7 +269,7 @@ constexpr v4sf vinsert0(v4sf v, float a) noexcept } #define UNINTERLEAVE2 uninterleave2 -[[gnu::always_inline]] inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { v4sf tmp0{unpacklo(x0, x1)}; v4sf tmp2{unpacklo(x2, x3)}; @@ -262,7 +282,7 @@ constexpr v4sf vinsert0(v4sf v, float a) noexcept } #define VTRANSPOSE4 vtranspose4 -[[gnu::always_inline]] inline v4sf vswaphl(v4sf a, v4sf b) noexcept +ALWAYS_INLINE(v4sf) vswaphl(v4sf a, v4sf b) noexcept { return v4sf{b[0], b[1], a[2], a[3]}; } #define VSWAPHL vswaphl @@ -290,8 +310,21 @@ typedef float v4sf; #endif // shortcuts for complex multiplications -#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) -#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) +ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept +{ + v4sf tmp{VMUL(ar, bi)}; + ar = VSUB(VMUL(ar, br), VMUL(ai, bi)); + ai = VMADD(ai, br, tmp); +} +#define VCPLXMUL vcplxmul + +ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept +{ + v4sf tmp{VMUL(ar, bi)}; + ar = VMADD(ai, bi, VMUL(ar, br)); + ai = VSUB(VMUL(ai, br), tmp); +} +#define VCPLXMULCONJ vcplxmulconj #if !defined(PFFFT_SIMD_DISABLE) @@ -366,14 +399,13 @@ NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, } else { - const v4sf vsign{LD_PS1(fsign)}; for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { for(size_t i{0};i < ido-1;i += 2) { v4sf tr2{VSUB(cc[i+0], cc[i+ido+0])}; v4sf ti2{VSUB(cc[i+1], cc[i+ido+1])}; - v4sf wr{LD_PS1(wa1[i])}, wi{VMUL(vsign, LD_PS1(wa1[i+1]))}; + v4sf wr{LD_PS1(wa1[i])}, wi{LD_PS1(wa1[i+1]*fsign)}; ch[i] = VADD(cc[i+0], cc[i+ido+0]); ch[i+1] = VADD(cc[i+1], cc[i+ido+1]); VCPLXMUL(tr2, ti2, wr, wi); -- cgit v1.2.3 From a82c5373667aae8f9e87b9d87ef9d2dec625f2fb Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 16 Oct 2023 09:01:41 -0700 Subject: Replace some function-like macros with real functions --- common/pffft.cpp | 253 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 136 insertions(+), 117 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 80d4e9c7..5a6bb4db 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -116,7 +116,7 @@ typedef vector float v4sf; #define VMADD vec_madd #define VSUB vec_sub #define LD_PS1 vec_splats -inline v4sf vset4(float a, float b, float c, float d) +ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept { /* There a more 
efficient way to do this? */ alignas(16) std::array vals{{a, b, c, d}}; @@ -125,32 +125,33 @@ inline v4sf vset4(float a, float b, float c, float d) #define VSET4 vset4 #define VINSERT0(v, a) vec_insert((a), (v), 0) #define VEXTRACT0(v) vec_extract((v), 0) + ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{vec_mergeh(in1, in2)}; out2 = vec_mergel(in1, in2); out1 = tmp; } -#define INTERLEAVE2 interleave2 ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{vec_perm(in1, in2, (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27))}; out2 = vec_perm(in1, in2, (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31)); out1 = tmp; } -#define UNINTERLEAVE2 uninterleave2 -#define VTRANSPOSE4(x0,x1,x2,x3) do { \ - v4sf y0 = vec_mergeh(x0, x2); \ - v4sf y1 = vec_mergel(x0, x2); \ - v4sf y2 = vec_mergeh(x1, x3); \ - v4sf y3 = vec_mergel(x1, x3); \ - x0 = vec_mergeh(y0, y2); \ - x1 = vec_mergel(y0, y2); \ - x2 = vec_mergeh(y1, y3); \ - x3 = vec_mergel(y1, y3); \ -} while(0) + +ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + v4sf y0{vec_mergeh(x0, x2)}; + v4sf y1{vec_mergel(x0, x2)}; + v4sf y2{vec_mergeh(x1, x3)}; + v4sf y3{vec_mergel(x1, x3)}; + x0 = vec_mergeh(y0, y2); + x1 = vec_mergel(y0, y2); + x2 = vec_mergeh(y1, y3); + x3 = vec_mergel(y1, y3); +} + #define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) -#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) /* * SSE1 support macros @@ -170,23 +171,24 @@ typedef __m128 v4sf; #define VSET4 _mm_setr_ps #define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) #define VEXTRACT0 _mm_cvtss_f32 + ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{_mm_unpacklo_ps(in1, in2)}; out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp; } -#define INTERLEAVE2 interleave2 ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{_mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0))}; out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp; } -#define UNINTERLEAVE2 uninterleave2 -#define VTRANSPOSE4 _MM_TRANSPOSE4_PS + +ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ _MM_TRANSPOSE4_PS(x0, x1, x2, x3); } + #define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) -#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) /* * ARM NEON support macros @@ -213,19 +215,40 @@ ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept #define VSET4 vset4 #define VINSERT0(v, a) vsetq_lane_f32((a), (v), 0) #define VEXTRACT0(v) vgetq_lane_f32((v), 0) -#define INTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) -#define UNINTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0) -#define VTRANSPOSE4(x0,x1,x2,x3) do { \ - float32x4x2_t t0_ = vzipq_f32(x0, x2); \ - float32x4x2_t t1_ = vzipq_f32(x1, x3); \ - float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ - float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ - x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ -} while(0) -// marginally faster version -//#define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } + 
+ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + float32x4x2_t tmp{vzipq_f32(in1, in2)}; + out1 = tmp.val[0]; + out2 = tmp.val[1]; +} +ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + float32x4x2_t tmp{vuzpq_f32(in1, in2)}; + out1 = tmp.val[0]; + out2 = tmp.val[1]; +} + +ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + /* marginally faster version: + * asm("vtrn.32 %q0, %q1;\n" + * "vtrn.32 %q2, %q3\n + * "vswp %f0, %e2\n + * "vswp %f1, %e3" + * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); + */ + float32x4x2_t t0_{vzipq_f32(x0, x2)}; + float32x4x2_t t1_{vzipq_f32(x1, x3)}; + float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])}; + float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])}; + x0 = u0_.val[0]; + x1 = u0_.val[1]; + x2 = u1_.val[0]; + x3 = u1_.val[1]; +} + #define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) -#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0x3) == 0) /* * Generic GCC vector macros @@ -255,19 +278,16 @@ ALWAYS_INLINE(v4sf) unpackhi(v4sf a, v4sf b) noexcept ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { - v4sf tmp__{unpacklo(in1, in2)}; + v4sf tmp{unpacklo(in1, in2)}; out2 = unpackhi(in1, in2); - out1 = tmp__; + out1 = tmp; } -#define INTERLEAVE2 interleave2 - ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { - v4sf tmp__{in1[0], in1[2], in2[0], in2[2]}; + v4sf tmp{in1[0], in1[2], in2[0], in2[2]}; out2 = v4sf{in1[1], in1[3], in2[1], in2[3]}; - out1 = tmp__; + out1 = tmp; } -#define UNINTERLEAVE2 uninterleave2 ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { @@ -280,14 +300,11 @@ ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]}; x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]}; } -#define VTRANSPOSE4 vtranspose4 ALWAYS_INLINE(v4sf) vswaphl(v4sf a, v4sf b) noexcept { return v4sf{b[0], b[1], a[2], a[3]}; } #define VSWAPHL vswaphl -#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0xF) == 0) - #else #warning "building with simd disabled !\n"; @@ -306,9 +323,14 @@ typedef float v4sf; #define VMADD(a,b,c) ((a)*(b)+(c)) #define VSUB(a,b) ((a)-(b)) #define LD_PS1(p) (p) -#define VALIGNED(ptr) ((reinterpret_cast(ptr) & 0x3) == 0) #endif +inline bool valigned(const float *ptr) noexcept +{ + static constexpr uintptr_t alignmask{SIMD_SZ*4 - 1}; + return (reinterpret_cast(ptr) & alignmask) == 0; +} + // shortcuts for complex multiplications ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept { @@ -316,15 +338,12 @@ ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept ar = VSUB(VMUL(ar, br), VMUL(ai, bi)); ai = VMADD(ai, br, tmp); } -#define VCPLXMUL vcplxmul - ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept { v4sf tmp{VMUL(ar, bi)}; ar = VMADD(ai, bi, VMUL(ar, br)); ai = VSUB(VMUL(ai, br), tmp); } -#define VCPLXMULCONJ vcplxmulconj #if !defined(PFFFT_SIMD_DISABLE) @@ -352,10 +371,10 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept t_v = VMADD(a1_v, a2_v,a0_v); t_f = al::bit_cast(t_v); printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 46, 62, 80); - INTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast(t_v); u_f = al::bit_cast(u_v); + interleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast(t_v); u_f = 
al::bit_cast(u_v); printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]); assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); - UNINTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast(t_v); u_f = al::bit_cast(u_v); + uninterleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast(t_v); u_f = al::bit_cast(u_v); printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]); assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); @@ -365,7 +384,7 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept t_v = VSWAPHL(a1_v, a2_v); t_f = al::bit_cast(t_v); printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 8, 9, 6, 7); - VTRANSPOSE4(a0_v, a1_v, a2_v, a3_v); + vtranspose4(a0_v, a1_v, a2_v, a3_v); a0_f = al::bit_cast(a0_v); a1_f = al::bit_cast(a1_v); a2_f = al::bit_cast(a2_v); @@ -408,7 +427,7 @@ NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf wr{LD_PS1(wa1[i])}, wi{LD_PS1(wa1[i+1]*fsign)}; ch[i] = VADD(cc[i+0], cc[i+ido+0]); ch[i+1] = VADD(cc[i+1], cc[i+ido+1]); - VCPLXMUL(tr2, ti2, wr, wi); + vcplxmul(tr2, ti2, wr, wi); ch[i+l1ido] = tr2; ch[i+l1ido+1] = ti2; } @@ -444,10 +463,10 @@ NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf di2{VADD(ci2, cr3)}; v4sf di3{VSUB(ci2, cr3)}; float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; - VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch[i+l1ido] = dr2; ch[i+l1ido + 1] = di2; - VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); ch[i+2*l1ido] = dr3; ch[i+2*l1ido+1] = di3; } @@ -508,17 +527,17 @@ NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf ci2{VADD(ti1, ti4)}; v4sf ci4{VSUB(ti1, ti4)}; float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}; - VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); + vcplxmul(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); float wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; ch[i + l1ido] = cr2; ch[i + l1ido + 1] = ci2; - VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); + vcplxmul(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}; ch[i + 2*l1ido] = cr3; ch[i + 2*l1ido + 1] = ci3; - VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3)); + vcplxmul(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3)); ch[i + 3*l1ido] = cr4; ch[i + 3*l1ido + 1] = ci4; } @@ -573,16 +592,16 @@ NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf di2{VADD(ci2, cr5)}; float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]}; - VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch_ref(i - 1, 2) = dr2; ch_ref(i, 2) = di2; - VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); ch_ref(i - 1, 3) = dr3; ch_ref(i, 3) = di3; - VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + vcplxmul(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); ch_ref(i - 1, 4) = dr4; ch_ref(i, 4) = di4; - VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + vcplxmul(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); ch_ref(i - 1, 5) = dr5; ch_ref(i, 5) = di5; } @@ -611,7 +630,7 @@ NEVER_INLINE(void) radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTR { v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]}; v4sf br{cc[i - 1 + k]}, 
bi{cc[i + k]}; - VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); + vcplxmulconj(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); ch[i + 2*k] = VADD(bi, ti2); ch[2*(k+ido) - i] = VSUB(ti2, bi); ch[i - 1 + 2*k] = VADD(br, tr2); @@ -657,7 +676,7 @@ NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v v4sf tr2{VSUB(a, b)}; ch[i+0 + k] = VSUB(c, d); v4sf ti2{VADD(c, d)}; - VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); + vcplxmul(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); ch[i-1 + k + l1ido] = tr2; ch[i+0 + k + l1ido] = ti2; } @@ -698,13 +717,13 @@ void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf wi1{LD_PS1(wa1[i - 1])}; v4sf dr2{cc[i - 1 + (k + l1)*ido]}; v4sf di2{cc[i + (k + l1)*ido]}; - VCPLXMULCONJ(dr2, di2, wr1, wi1); + vcplxmulconj(dr2, di2, wr1, wi1); v4sf wr2{LD_PS1(wa2[i - 2])}; v4sf wi2{LD_PS1(wa2[i - 1])}; v4sf dr3{cc[i - 1 + (k + l1*2)*ido]}; v4sf di3{cc[i + (k + l1*2)*ido]}; - VCPLXMULCONJ(dr3, di3, wr2, wi2); + vcplxmulconj(dr3, di3, wr2, wi2); v4sf cr2{VADD(dr2, dr3)}; v4sf ci2{VADD(di2, di3)}; @@ -762,10 +781,10 @@ void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf dr3{VADD(cr2, ci3)}; v4sf di2{VADD(ci2, cr3)}; v4sf di3{VSUB(ci2, cr3)}; - VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); + vcplxmul(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); ch[i - 1 + (k + l1)*ido] = dr2; ch[i + (k + l1)*ido] = di2; - VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); + vcplxmul(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); ch[i - 1 + (k + 2*l1)*ido] = dr3; ch[i + (k + 2*l1)*ido] = di3; } @@ -811,19 +830,19 @@ NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTR v4sf ci2{pc[1*l1ido+1]}; v4sf wr{LD_PS1(wa1[i - 2])}; v4sf wi{LD_PS1(wa1[i - 1])}; - VCPLXMULCONJ(cr2,ci2,wr,wi); + vcplxmulconj(cr2,ci2,wr,wi); v4sf cr3{pc[2*l1ido+0]}; v4sf ci3{pc[2*l1ido+1]}; wr = LD_PS1(wa2[i-2]); wi = LD_PS1(wa2[i-1]); - VCPLXMULCONJ(cr3, ci3, wr, wi); + vcplxmulconj(cr3, ci3, wr, wi); v4sf cr4{pc[3*l1ido]}; v4sf ci4{pc[3*l1ido+1]}; wr = LD_PS1(wa3[i-2]); wi = LD_PS1(wa3[i-1]); - VCPLXMULCONJ(cr4, ci4, wr, wi); + vcplxmulconj(cr4, ci4, wr, wi); /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */ @@ -918,13 +937,13 @@ NEVER_INLINE(void) radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTR v4sf ci3{VSUB(ti2, ti3)}; v4sf ci2{VADD(ti1, ti4)}; v4sf ci4{VSUB(ti1, ti4)}; - VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); + vcplxmul(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); ph[0] = cr2; ph[1] = ci2; ph += l1ido; - VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); + vcplxmul(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); ph[0] = cr3; ph[1] = ci3; ph += l1ido; - VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1])); + vcplxmul(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1])); ph[0] = cr4; ph[1] = ci4; ph = ph - 3*l1ido + 2; } @@ -997,10 +1016,10 @@ void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf di4{LD_PS1(wa3[i-2])}; v4sf dr5{LD_PS1(wa4[i-3])}; v4sf di5{LD_PS1(wa4[i-2])}; - VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); - VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); - VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); - VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); + vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); + vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); + 
vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); + vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); v4sf cr2{VADD(dr2, dr5)}; v4sf ci5{VSUB(dr5, dr2)}; v4sf cr5{VSUB(di2, di5)}; @@ -1102,10 +1121,10 @@ void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf dr2{VSUB(cr2, ci5)}; v4sf di5{VSUB(ci2, cr5)}; v4sf di2{VADD(ci2, cr5)}; - VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); - VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); - VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); - VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2])); + vcplxmul(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); + vcplxmul(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); + vcplxmul(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); + vcplxmul(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2])); ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2; ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3; @@ -1501,14 +1520,14 @@ namespace { void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *out) { v4sf g0, g1; - INTERLEAVE2(in[0], in[1], g0, g1); + interleave2(in[0], in[1], g0, g1); in += in_stride; *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] for(size_t k{1};k < N;++k) { v4sf h0, h1; - INTERLEAVE2(in[0], in[1], h0, h1); + interleave2(in[0], in[1], h0, h1); in += in_stride; *--out = VSWAPHL(g1, h0); *--out = VSWAPHL(h0, h1); @@ -1526,14 +1545,14 @@ void unreversed_copy(const size_t N, const v4sf *in, v4sf *out, const int out_st v4sf h0{*in++}; v4sf h1{*in++}; g1 = VSWAPHL(g1, h0); h0 = VSWAPHL(h0, h1); - UNINTERLEAVE2(h0, g1, out[0], out[1]); + uninterleave2(h0, g1, out[0], out[1]); out += out_stride; g1 = h1; } v4sf h0{*in++}, h1{g0}; g1 = VSWAPHL(g1, h0); h0 = VSWAPHL(h0, h1); - UNINTERLEAVE2(h0, g1, out[0], out[1]); + uninterleave2(h0, g1, out[0], out[1]); } void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) @@ -1547,11 +1566,11 @@ void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4 v4sf r1{in[8*k+2]}, i1{in[8*k+3]}; v4sf r2{in[8*k+4]}, i2{in[8*k+5]}; v4sf r3{in[8*k+6]}, i3{in[8*k+7]}; - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); - VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); - VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); + vcplxmul(r1,i1,e[k*6+0],e[k*6+1]); + vcplxmul(r2,i2,e[k*6+2],e[k*6+3]); + vcplxmul(r3,i3,e[k*6+4],e[k*6+5]); v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)}; v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)}; @@ -1602,12 +1621,12 @@ void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); - VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); - VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); - VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); + vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]); + vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]); + vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]); - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; @@ -1622,8 +1641,8 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co v4sf r1{*in++}; v4sf i1{*in++}; v4sf r2{*in++}; v4sf i2{*in++}; v4sf r3{*in++}; v4sf i3{*in++}; - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); /* 
transformation for each column is: * @@ -1640,9 +1659,9 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; - VCPLXMUL(r1,i1,e[0],e[1]); - VCPLXMUL(r2,i2,e[2],e[3]); - VCPLXMUL(r3,i3,e[4],e[5]); + vcplxmul(r1,i1,e[0],e[1]); + vcplxmul(r2,i2,e[2],e[3]); + vcplxmul(r3,i3,e[4],e[5]); //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; @@ -1741,12 +1760,12 @@ ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4s i1 = VSUB(si0, dr1); i3 = VADD(si0, dr1); - VCPLXMULCONJ(r1,i1,e[0],e[1]); - VCPLXMULCONJ(r2,i2,e[2],e[3]); - VCPLXMULCONJ(r3,i3,e[4],e[5]); + vcplxmulconj(r1,i1,e[0],e[1]); + vcplxmulconj(r2,i2,e[2],e[3]); + vcplxmulconj(r3,i3,e[4],e[5]); - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); + vtranspose4(r0,r1,r2,r3); + vtranspose4(i0,i1,i2,i3); if(!first) { @@ -1831,7 +1850,7 @@ void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf { v4sf *tmp{buff[ib]}; for(size_t k=0; k < Ncvec; ++k) - UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]); pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e); @@ -1864,7 +1883,7 @@ void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e); ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]); for(size_t k{0};k < Ncvec;++k) - INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); } } @@ -1897,8 +1916,8 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, { for(size_t k{0};k < dk;++k) { - INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); - INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); } reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2); reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ); @@ -1907,8 +1926,8 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, { for(size_t k{0};k < dk;++k) { - UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); - UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); } unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8); unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8); @@ -1921,7 +1940,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, for(size_t k{0};k < Ncvec;++k) { size_t kk{(k/4) + (k%4)*(Ncvec/4)}; 
- INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); } } else @@ -1929,7 +1948,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, for(size_t k{0};k < Ncvec;++k) { size_t kk{(k/4) + (k%4)*(Ncvec/4)}; - UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); } } } @@ -2019,12 +2038,12 @@ void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, cons { v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; - VCPLXMUL(ar4, ai4, br4, bi4); + vcplxmul(ar4, ai4, br4, bi4); vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]); vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]); ar4 = va[2*i+2]; ai4 = va[2*i+3]; br4 = vb[2*i+2]; bi4 = vb[2*i+3]; - VCPLXMUL(ar4, ai4, br4, bi4); + vcplxmul(ar4, ai4, br4, bi4); vab[2*i+2] = VMADD(ar4, vscal, vab[2*i+2]); vab[2*i+3] = VMADD(ai4, vscal, vab[2*i+3]); } @@ -2073,12 +2092,12 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa { v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]}; v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]}; - VCPLXMUL(ar4, ai4, br4, bi4); + vcplxmul(ar4, ai4, br4, bi4); vab[2*i+0] = VADD(ar4, vab[2*i+0]); vab[2*i+1] = VADD(ai4, vab[2*i+1]); ar4 = va[2*i+2]; ai4 = va[2*i+3]; br4 = vb[2*i+2]; bi4 = vb[2*i+3]; - VCPLXMUL(ar4, ai4, br4, bi4); + vcplxmul(ar4, ai4, br4, bi4); vab[2*i+2] = VADD(ar4, vab[2*i+2]); vab[2*i+3] = VADD(ai4, vab[2*i+3]); } @@ -2094,7 +2113,7 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work)); + assert(valigned(input) && valigned(output) && valigned(work)); pffft_transform_internal(setup, reinterpret_cast(al::assume_aligned<16>(input)), reinterpret_cast(al::assume_aligned<16>(output)), reinterpret_cast(al::assume_aligned<16>(work)), direction, false); @@ -2103,7 +2122,7 @@ void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work)); + assert(valigned(input) && valigned(output) && valigned(work)); pffft_transform_internal(setup, reinterpret_cast(al::assume_aligned<16>(input)), reinterpret_cast(al::assume_aligned<16>(output)), reinterpret_cast(al::assume_aligned<16>(work)), direction, true); @@ -2217,7 +2236,7 @@ void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, cons { float ar{a[2*i+0]}, ai{a[2*i+1]}; const float br{b[2*i+0]}, bi{b[2*i+1]}; - VCPLXMUL(ar, ai, br, bi); + vcplxmul(ar, ai, br, bi); ab[2*i+0] += ar*scaling; ab[2*i+1] += ai*scaling; } @@ -2238,7 +2257,7 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa { float ar{a[2*i+0]}, ai{a[2*i+1]}; const float br{b[2*i+0]}, bi{b[2*i+1]}; - VCPLXMUL(ar, ai, br, bi); + vcplxmul(ar, ai, br, bi); ab[2*i+0] += ar; ab[2*i+1] += ai; } -- cgit v1.2.3 From 29bf9a50d46e18f03e940cc6e8e6dd8b61475a3a Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 16 Oct 2023 16:16:44 -0700 Subject: Mark some output buffer pointers as RESTRICT --- common/pffft.cpp | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'common/pffft.cpp') diff --git 
a/common/pffft.cpp b/common/pffft.cpp index 5a6bb4db..7390a777 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -402,7 +402,7 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 */ -NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float fsign) { const size_t l1ido{l1*ido}; @@ -438,7 +438,7 @@ NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ -NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float fsign) { assert(ido > 2); @@ -473,7 +473,7 @@ NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, } } /* passf3 */ -NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float fsign) { /* fsign == -1 for forward transform and +1 for backward transform */ @@ -548,7 +548,7 @@ NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, /* * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 */ -NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -649,7 +649,7 @@ NEVER_INLINE(void) radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTR } /* radf2 */ -NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *ch, +NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1) { const size_t l1ido{l1*ido}; @@ -1517,7 +1517,7 @@ void pffft_destroy_setup(PFFFT_Setup *s) namespace { /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *out) +void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out) { v4sf g0, g1; interleave2(in[0], in[1], g0, g1); @@ -1536,7 +1536,7 @@ void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *ou *--out = VSWAPHL(g1, g0); } -void unreversed_copy(const size_t N, const v4sf *in, v4sf *out, const int out_stride) +void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride) { v4sf g0{in[0]}, g1{g0}; ++in; @@ -1555,7 +1555,7 @@ void unreversed_copy(const size_t N, const v4sf *in, v4sf *out, const int out_st uninterleave2(h0, g1, out[0], out[1]); } -void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e) { assert(in != out); @@ -1599,7 +1599,7 @@ void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4 } } -void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, 
const v4sf *e) +void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e) { assert(in != out); @@ -1635,7 +1635,7 @@ void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, - const v4sf *e, v4sf *out) + const v4sf *e, v4sf *RESTRICT out) { v4sf r0{*in0}, i0{*in1}; v4sf r1{*in++}; v4sf i1{*in++}; @@ -1690,7 +1690,8 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co *out++ = i3; } -NEVER_INLINE(void) pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +NEVER_INLINE(void) pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, + const v4sf *e) { static constexpr float s{al::numbers::sqrt2_v/2.0f}; @@ -1728,7 +1729,7 @@ NEVER_INLINE(void) pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); } -ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, +ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out, const bool first) { v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]}; @@ -1780,7 +1781,8 @@ ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4s *out++ = i3; } -NEVER_INLINE(void) pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +NEVER_INLINE(void) pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, + const v4sf *e) { static constexpr float sqrt2{al::numbers::sqrt2_v}; @@ -1908,7 +1910,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out, const size_t N{setup->N}, Ncvec{setup->Ncvec}; const v4sf *vin{reinterpret_cast(in)}; - v4sf *vout{reinterpret_cast(out)}; + v4sf *RESTRICT vout{reinterpret_cast(out)}; if(setup->transform == PFFFT_REAL) { const size_t dk{N/32}; @@ -2192,7 +2194,7 @@ void pffft_transform_internal_nosimd(const PFFFT_Setup *setup, const float *inpu } // namespace #define pffft_zreorder_nosimd pffft_zreorder -void pffft_zreorder_nosimd(const PFFFT_Setup *setup, const float *in, float *out, +void pffft_zreorder_nosimd(const PFFFT_Setup *setup, const float *in, float *RESTRICT out, pffft_direction_t direction) { const size_t N{setup->N}; -- cgit v1.2.3 From 5b3d6e629cd4145f302d8aea132ba406433dbf17 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Tue, 17 Oct 2023 05:48:43 -0700 Subject: Combine some more VADD(VMUL(... into VMADD(... 
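
VMADD(a,b,c) computes a*b + c in one step (vec_madd on Altivec, vmlaq_f32 on NEON, a plain expression in the generic and scalar paths), so folding a separate VMUL into the following VADD saves an instruction and a temporary on targets with a multiply-add. For illustration, the pattern rewritten in passf3_ps below:

    /* before: separate multiply and add */
    v4sf cr2{VADD(cc[i], VMUL(taur,tr2))};
    /* after: single multiply-add, VMADD(a, b, c) == a*b + c */
    v4sf cr2{VMADD(taur, tr2, cc[i])};

The remaining changes are cosmetic: aligned array indices, reordered commutative VADD operands, and the single-use *_offset temporaries folded into the pointer adjustments.
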
--- common/pffft.cpp | 92 +++++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 48 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 7390a777..672a97a3 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -451,10 +451,10 @@ NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, for(size_t i{0};i < ido-1;i += 2) { v4sf tr2{VADD(cc[i+ido], cc[i+2*ido])}; - v4sf cr2{VADD(cc[i], VMUL(taur,tr2))}; - ch[i] = VADD(cc[i], tr2); + v4sf cr2{VMADD(taur, tr2, cc[i])}; + ch[i] = VADD(tr2, cc[i]); v4sf ti2{VADD(cc[i+ido+1], cc[i+2*ido+1])}; - v4sf ci2{VADD(cc[i +1], VMUL(taur,ti2))}; + v4sf ci2{VMADD(taur, ti2, cc[i+1])}; ch[i+1] = VADD(cc[i+1], ti2); v4sf cr3{VMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]))}; v4sf ci3{VMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]))}; @@ -574,12 +574,12 @@ NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf tr3{VADD(cc_ref(i-1, 3), cc_ref(i-1, 4))}; ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); - v4sf cr2{VADD(cc_ref(i-1, 1), VADD(VMUL(tr11, tr2),VMUL(tr12, tr3)))}; - v4sf ci2{VADD(cc_ref(i , 1), VADD(VMUL(tr11, ti2),VMUL(tr12, ti3)))}; - v4sf cr3{VADD(cc_ref(i-1, 1), VADD(VMUL(tr12, tr2),VMUL(tr11, tr3)))}; - v4sf ci3{VADD(cc_ref(i , 1), VADD(VMUL(tr12, ti2),VMUL(tr11, ti3)))}; - v4sf cr5{VADD(VMUL(ti11, tr5), VMUL(ti12, tr4))}; - v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf cr2{VADD(cc_ref(i-1, 1), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1), VMADD(tr11, ti2, VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1), VMADD(tr12, ti2, VMUL(tr11, ti3)))}; + v4sf cr5{VMADD(ti11, tr5, VMUL(ti12, tr4))}; + v4sf ci5{VMADD(ti11, ti5, VMUL(ti12, ti4))}; v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; v4sf dr3{VSUB(cr3, ci4)}; @@ -702,9 +702,9 @@ void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * for(size_t k{0};k < l1;++k) { v4sf cr2{VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])}; - ch[3*k*ido] = VADD(cc[k*ido], cr2); - ch[(3*k+2)*ido] = VMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); - ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(taur, cr2)); + ch[ (3*k )*ido] = VADD(cc[k*ido], cr2); + ch[ (3*k + 2)*ido] = VMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); + ch[ido-1 + (3*k + 1)*ido] = VMADD(taur, cr2, cc[k*ido]); } if(ido == 1) return; @@ -716,27 +716,27 @@ void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf wr1{LD_PS1(wa1[i - 2])}; v4sf wi1{LD_PS1(wa1[i - 1])}; v4sf dr2{cc[i - 1 + (k + l1)*ido]}; - v4sf di2{cc[i + (k + l1)*ido]}; + v4sf di2{cc[i + (k + l1)*ido]}; vcplxmulconj(dr2, di2, wr1, wi1); v4sf wr2{LD_PS1(wa2[i - 2])}; v4sf wi2{LD_PS1(wa2[i - 1])}; v4sf dr3{cc[i - 1 + (k + l1*2)*ido]}; - v4sf di3{cc[i + (k + l1*2)*ido]}; + v4sf di3{cc[i + (k + l1*2)*ido]}; vcplxmulconj(dr3, di3, wr2, wi2); v4sf cr2{VADD(dr2, dr3)}; v4sf ci2{VADD(di2, di3)}; ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2); - ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2); - v4sf tr2{VADD(cc[i - 1 + k*ido], VMUL(taur, cr2))}; - v4sf ti2{VADD(cc[i + k*ido], VMUL(taur, ci2))}; + ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2); + v4sf tr2{VMADD(taur, cr2, cc[i - 1 + k*ido])}; + v4sf ti2{VMADD(taur, ci2, cc[i + k*ido])}; v4sf tr3{VMUL(taui, VSUB(di2, di3))}; v4sf ti3{VMUL(taui, VSUB(dr3, 
dr2))}; - ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3); + ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3); ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3); - ch[i + (3*k + 2)*ido] = VADD(ti2, ti3); - ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2); + ch[i + (3*k + 2)*ido] = VADD(ti2, ti3); + ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2); } } } /* radf3 */ @@ -850,17 +850,17 @@ NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTR v4sf tr4{VSUB(cr4,cr2)}; v4sf tr2{VADD(pc[0],cr3)}; v4sf tr3{VSUB(pc[0],cr3)}; - ch[i - 1 + 4*k] = VADD(tr1,tr2); + ch[i - 1 + 4*k ] = VADD(tr2,tr1); ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed v4sf ti1{VADD(ci2,ci4)}; v4sf ti4{VSUB(ci2,ci4)}; - ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3); + ch[i - 1 + 4*k + 2*ido] = VADD(tr3,ti4); ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4 v4sf ti2{VADD(pc[1],ci3)}; v4sf ti3{VSUB(pc[1],ci3)}; - ch[i + 4*k] = VADD(ti1, ti2); + ch[i + 4*k ] = VADD(ti1, ti2); ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2); - ch[i + 4*k + 2*ido] = VADD(tr4, ti3); + ch[i + 4*k + 2*ido] = VADD(tr4, ti3); ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3); } } @@ -872,12 +872,12 @@ NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTR { v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]}; v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]}; - v4sf ti1{VMUL(minus_hsqt2, VADD(a, b))}; + v4sf ti1{VMUL(minus_hsqt2, VADD(b, a))}; v4sf tr1{VMUL(minus_hsqt2, VSUB(b, a))}; - ch[ido-1 + 4*k] = VADD(tr1, c); + ch[ido-1 + 4*k ] = VADD(c, tr1); ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1); - ch[4*k + 1*ido] = VSUB(ti1, d); - ch[4*k + 3*ido] = VADD(ti1, d); + ch[ 4*k + 1*ido] = VSUB(ti1, d); + ch[ 4*k + 3*ido] = VADD(ti1, d); } } /* radf4 */ @@ -918,8 +918,8 @@ NEVER_INLINE(void) radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTR v4sf *RESTRICT ph{ch + k + 1}; for(size_t i{2};i < ido;i += 2) { - v4sf tr1{VSUB(pc[i], pc[4*ido - i])}; - v4sf tr2{VADD(pc[i], pc[4*ido - i])}; + v4sf tr1{VSUB(pc[ i], pc[4*ido - i])}; + v4sf tr2{VADD(pc[ i], pc[4*ido - i])}; v4sf ti4{VSUB(pc[2*ido + i], pc[2*ido - i])}; v4sf tr3{VADD(pc[2*ido + i], pc[2*ido - i])}; ph[0] = VADD(tr2, tr3); @@ -980,10 +980,8 @@ void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] /* Parameter adjustments */ - const size_t ch_offset{1 + ido * 6}; - ch -= ch_offset; - const size_t cc_offset{1 + ido * (1 + l1)}; - cc -= cc_offset; + ch -= 1 + ido * 6; + cc -= 1 + ido * (1 + l1); /* Function Body */ for(size_t k{1};k <= l1;++k) @@ -1000,7 +998,7 @@ void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); } if(ido == 1) - return; + return; const size_t idp2{ido + 2}; for(size_t k{1};k <= l1;++k) @@ -1038,14 +1036,14 @@ void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf ti5{VMADD(ti11, ci5, VMUL(ti12, ci4))}; v4sf tr4{VSUB(VMUL(ti12, cr5), VMUL(ti11, cr4))}; v4sf ti4{VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4))}; - ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); + ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); - ch_ref(i, 3, k) = VADD(ti2, ti5); - ch_ref(ic, 2, k) = VSUB(ti5, ti2); - ch_ref(i - 1, 5, k) = VSUB(tr3, tr4); + ch_ref(i , 3, k) = VADD(ti5, ti2); + ch_ref(ic , 2, k) = VSUB(ti5, ti2); + ch_ref(i - 1, 5, k) = VSUB(tr3, tr4); ch_ref(ic - 1, 4, k) = VADD(tr3, tr4); - ch_ref(i, 5, k) = VADD(ti3, ti4); 
- ch_ref(ic, 4, k) = VSUB(ti4, ti3); + ch_ref(i , 5, k) = VADD(ti4, ti3); + ch_ref(ic , 4, k) = VSUB(ti4, ti3); } } #undef cc_ref @@ -1064,16 +1062,14 @@ void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] /* Parameter adjustments */ - const size_t ch_offset{1 + ido*(1 + l1)}; - ch -= ch_offset; - const size_t cc_offset{1 + ido*6}; - cc -= cc_offset; + ch -= 1 + ido*(1 + l1); + cc -= 1 + ido*6; /* Function Body */ for(size_t k{1};k <= l1;++k) { - v4sf ti5{VADD(cc_ref(1, 3, k), cc_ref(1, 3, k))}; - v4sf ti4{VADD(cc_ref(1, 5, k), cc_ref(1, 5, k))}; + v4sf ti5{VADD(cc_ref( 1, 3, k), cc_ref(1, 3, k))}; + v4sf ti4{VADD(cc_ref( 1, 5, k), cc_ref(1, 5, k))}; v4sf tr2{VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k))}; v4sf tr3{VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k))}; ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); @@ -1104,7 +1100,7 @@ void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * v4sf tr4{VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; v4sf tr3{VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3)); - ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3)); + ch_ref(i , k, 1) = VADD(cc_ref(i , 1, k), VADD(ti2, ti3)); v4sf cr2{VADD(cc_ref(i-1, 1, k), VMADD(tr11, tr2, VMUL(tr12, tr3)))}; v4sf ci2{VADD(cc_ref(i , 1, k), VMADD(tr11, ti2, VMUL(tr12, ti3)))}; v4sf cr3{VADD(cc_ref(i-1, 1, k), VMADD(tr12, tr2, VMUL(tr11, tr3)))}; -- cgit v1.2.3 From 6ac36a9ac3821199fccf8fe967a00fb48c7b8f9e Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Thu, 26 Oct 2023 18:18:54 -0700 Subject: Remove unnecessary extra macros --- common/pffft.cpp | 92 +++++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 55 deletions(-) (limited to 'common/pffft.cpp') diff --git a/common/pffft.cpp b/common/pffft.cpp index 672a97a3..71f71fa6 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -76,24 +76,6 @@ namespace { using uint = unsigned int; -#if defined(__GNUC__) -#define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) -#define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) -#define RESTRICT __restrict - -#elif defined(_MSC_VER) - -#define ALWAYS_INLINE(return_type) __forceinline return_type -#define NEVER_INLINE(return_type) __declspec(noinline) return_type -#define RESTRICT __restrict - -#else - -#define ALWAYS_INLINE(return_type) inline return_type -#define NEVER_INLINE(return_type) return_type -#define RESTRICT -#endif - /* Vector support macros: the rest of the code is independent of * SSE/Altivec/NEON -- adding support for other platforms with 4-element @@ -116,7 +98,7 @@ typedef vector float v4sf; #define VMADD vec_madd #define VSUB vec_sub #define LD_PS1 vec_splats -ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept +force_inline v4sf vset4(float a, float b, float c, float d) noexcept { /* There a more efficient way to do this? 
*/ alignas(16) std::array vals{{a, b, c, d}}; @@ -126,20 +108,20 @@ ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept #define VINSERT0(v, a) vec_insert((a), (v), 0) #define VEXTRACT0(v) vec_extract((v), 0) -ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{vec_mergeh(in1, in2)}; out2 = vec_mergel(in1, in2); out1 = tmp; } -ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{vec_perm(in1, in2, (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27))}; out2 = vec_perm(in1, in2, (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31)); out1 = tmp; } -ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { v4sf y0{vec_mergeh(x0, x2)}; v4sf y1{vec_mergel(x0, x2)}; @@ -172,20 +154,20 @@ typedef __m128 v4sf; #define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) #define VEXTRACT0 _mm_cvtss_f32 -ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{_mm_unpacklo_ps(in1, in2)}; out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp; } -ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{_mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0))}; out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp; } -ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { _MM_TRANSPOSE4_PS(x0, x1, x2, x3); } #define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) @@ -204,7 +186,7 @@ typedef float32x4_t v4sf; #define VMADD(a,b,c) vmlaq_f32(c,a,b) #define VSUB vsubq_f32 #define LD_PS1 vdupq_n_f32 -ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept +force_inline v4sf vset4(float a, float b, float c, float d) noexcept { float32x4_t ret{vmovq_n_f32(a)}; ret = vsetq_lane_f32(b, ret, 1); @@ -216,20 +198,20 @@ ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept #define VINSERT0(v, a) vsetq_lane_f32((a), (v), 0) #define VEXTRACT0(v) vgetq_lane_f32((v), 0) -ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { float32x4x2_t tmp{vzipq_f32(in1, in2)}; out1 = tmp.val[0]; out2 = tmp.val[1]; } -ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { float32x4x2_t tmp{vuzpq_f32(in1, in2)}; out1 = tmp.val[0]; out2 = tmp.val[1]; } -ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { /* marginally faster version: * asm("vtrn.32 %q0, %q1;\n" @@ -263,33 +245,33 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; #define VMADD(a,b,c) ((a)*(b) + (c)) #define VSUB(a,b) ((a) - (b)) -constexpr ALWAYS_INLINE(v4sf) ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } +constexpr 
force_inline v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 #define VSET4(a, b, c, d) v4sf{(a), (b), (c), (d)} -constexpr ALWAYS_INLINE(v4sf) vinsert0(v4sf v, float a) noexcept +constexpr force_inline v4sf vinsert0(v4sf v, float a) noexcept { return v4sf{a, v[1], v[2], v[3]}; } #define VINSERT0 vinsert0 #define VEXTRACT0(v) ((v)[0]) -ALWAYS_INLINE(v4sf) unpacklo(v4sf a, v4sf b) noexcept +force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept { return v4sf{a[0], b[0], a[1], b[1]}; } -ALWAYS_INLINE(v4sf) unpackhi(v4sf a, v4sf b) noexcept +force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept { return v4sf{a[2], b[2], a[3], b[3]}; } -ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{unpacklo(in1, in2)}; out2 = unpackhi(in1, in2); out1 = tmp; } -ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept { v4sf tmp{in1[0], in1[2], in2[0], in2[2]}; out2 = v4sf{in1[1], in1[3], in2[1], in2[3]}; out1 = tmp; } -ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept { v4sf tmp0{unpacklo(x0, x1)}; v4sf tmp2{unpacklo(x2, x3)}; @@ -301,7 +283,7 @@ ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]}; } -ALWAYS_INLINE(v4sf) vswaphl(v4sf a, v4sf b) noexcept +force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept { return v4sf{b[0], b[1], a[2], a[3]}; } #define VSWAPHL vswaphl @@ -332,13 +314,13 @@ inline bool valigned(const float *ptr) noexcept } // shortcuts for complex multiplications -ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept +force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept { v4sf tmp{VMUL(ar, bi)}; ar = VSUB(VMUL(ar, br), VMUL(ai, bi)); ai = VMADD(ai, br, tmp); } -ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept +force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept { v4sf tmp{VMUL(ar, bi)}; ar = VMADD(ai, bi, VMUL(ar, br)); @@ -402,7 +384,7 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 */ -NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, +NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float fsign) { const size_t l1ido{l1*ido}; @@ -438,7 +420,7 @@ NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ -NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, +NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float fsign) { assert(ido > 2); @@ -473,7 +455,7 @@ NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, } } /* passf3 */ -NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, +NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, 
const float *wa3, const float fsign) { /* fsign == -1 for forward transform and +1 for backward transform */ @@ -548,7 +530,7 @@ NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, /* * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 */ -NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, +NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign) { const v4sf tr11{LD_PS1(0.309016994374947f)}; @@ -610,7 +592,7 @@ NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, #undef cc_ref } -NEVER_INLINE(void) radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, +NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1) { const size_t l1ido{l1*ido}; @@ -649,7 +631,7 @@ NEVER_INLINE(void) radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTR } /* radf2 */ -NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, +NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch, const float *wa1) { const size_t l1ido{l1*ido}; @@ -791,7 +773,7 @@ void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * } } /* radb3 */ -NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, +NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) { @@ -882,7 +864,7 @@ NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTR } /* radf4 */ -NEVER_INLINE(void) radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, +NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) { @@ -1132,8 +1114,8 @@ void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf * #undef ch_ref } /* radb5 */ -NEVER_INLINE(v4sf *) rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, - v4sf *work2, const float *wa, const al::span ifac) +NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const al::span ifac) { assert(work1 != work2); @@ -1194,8 +1176,8 @@ NEVER_INLINE(v4sf *) rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf return const_cast(in); /* this is in fact the output .. 
*/ } /* rfftf1 */ -NEVER_INLINE(v4sf *) rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, - v4sf *work2, const float *wa, const al::span ifac) +NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, + const float *wa, const al::span ifac) { assert(work1 != work2); @@ -1630,7 +1612,7 @@ void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT ou } -ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, +force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, const v4sf *e, v4sf *RESTRICT out) { v4sf r0{*in0}, i0{*in1}; @@ -1686,7 +1668,7 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co *out++ = i3; } -NEVER_INLINE(void) pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, +NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e) { static constexpr float s{al::numbers::sqrt2_v/2.0f}; @@ -1725,7 +1707,7 @@ NEVER_INLINE(void) pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8); } -ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out, +force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out, const bool first) { v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]}; @@ -1777,7 +1759,7 @@ ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4s *out++ = i3; } -NEVER_INLINE(void) pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, +NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e) { static constexpr float sqrt2{al::numbers::sqrt2_v}; -- cgit v1.2.3
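
For reference, a minimal usage sketch of the const-qualified pffft API shown in the pffft.h hunks above. This is illustrative only: pffft_new_setup, the N-multiple-of-32 rule for PFFFT_REAL, and the N-float work buffer are assumed to match upstream PFFFT and are not part of these patches; the 16-byte alignment requirement comes from the valigned() assertions in pffft_transform/pffft_transform_ordered.

    #include "pffft.h"

    /* Hypothetical example (not taken from the patches): forward real FFT of
     * 512 samples. Assumes the upstream PFFFT setup functions and size rules
     * (N a multiple of 32 for PFFFT_REAL) are unchanged in this copy. */
    int main()
    {
        alignas(16) float input[512]{};  /* time-domain samples, 16-byte aligned */
        alignas(16) float spectrum[512]; /* ordered frequency-domain output */
        alignas(16) float work[512];     /* scratch buffer, assumed N floats for a real transform */

        PFFFT_Setup *setup{pffft_new_setup(512, PFFFT_REAL)};
        pffft_transform_ordered(setup, input, spectrum, work, PFFFT_FORWARD);
        pffft_destroy_setup(setup);
        return 0;
    }
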