path: root/common/pffft.cpp
Diffstat (limited to 'common/pffft.cpp')
-rw-r--r--  common/pffft.cpp  |  79
1 file changed, 76 insertions(+), 3 deletions(-)
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 7e5ba5c3..f8568acf 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -1904,7 +1904,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
}
}
-void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
    float scaling)
{
    const size_t Ncvec{s->Ncvec};
@@ -2006,6 +2006,59 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
}
}
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+    const size_t Ncvec{s->Ncvec};
+    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
+    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
+    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
+
+#ifdef __arm__
+    __builtin_prefetch(va);
+    __builtin_prefetch(vb);
+    __builtin_prefetch(vab);
+    __builtin_prefetch(va+2);
+    __builtin_prefetch(vb+2);
+    __builtin_prefetch(vab+2);
+    __builtin_prefetch(va+4);
+    __builtin_prefetch(vb+4);
+    __builtin_prefetch(vab+4);
+    __builtin_prefetch(va+6);
+    __builtin_prefetch(vb+6);
+    __builtin_prefetch(vab+6);
+#endif
+
+    const float ar1{VEXTRACT0(va[0])};
+    const float ai1{VEXTRACT0(va[1])};
+    const float br1{VEXTRACT0(vb[0])};
+    const float bi1{VEXTRACT0(vb[1])};
+    const float abr1{VEXTRACT0(vab[0])};
+    const float abi1{VEXTRACT0(vab[1])};
+
+    /* No inline assembly for this version. I'm not familiar enough with NEON
+     * assembly, and I don't know that it's needed with today's optimizers.
+     */
+    for(size_t i{0};i < Ncvec;i += 2)
+    {
+        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
+        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
+        VCPLXMUL(ar4, ai4, br4, bi4);
+        vab[2*i+0] = VADD(ar4, vab[2*i+0]);
+        vab[2*i+1] = VADD(ai4, vab[2*i+1]);
+        ar4 = va[2*i+2]; ai4 = va[2*i+3];
+        br4 = vb[2*i+2]; bi4 = vb[2*i+3];
+        VCPLXMUL(ar4, ai4, br4, bi4);
+        vab[2*i+2] = VADD(ar4, vab[2*i+2]);
+        vab[2*i+3] = VADD(ai4, vab[2*i+3]);
+    }
+
+    if(s->transform == PFFFT_REAL)
+    {
+        vab[0] = VINSERT0(vab[0], abr1 + ar1*br1);
+        vab[1] = VINSERT0(vab[1], abi1 + ai1*bi1);
+    }
+}
+
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
@@ -2115,8 +2168,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
}
}
-#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab,
+void pffft_zconvolve_scale_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
    float scaling)
{
    size_t Ncvec{s->Ncvec};
@@ -2138,6 +2190,27 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo
}
}
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab)
+{
+    size_t Ncvec{s->Ncvec};
+
+    if(s->transform == PFFFT_REAL)
+    {
+        // take care of the fftpack ordering
+        ab[0] += a[0]*b[0];
+        ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
+        ++ab; ++a; ++b; --Ncvec;
+    }
+    for(size_t i{0};i < Ncvec;++i)
+    {
+        float ar{a[2*i+0]}, ai{a[2*i+1]};
+        const float br{b[2*i+0]}, bi{b[2*i+1]};
+        VCPLXMUL(ar, ai, br, bi);
+        ab[2*i+0] += ar;
+        ab[2*i+1] += ai;
+    }
+}
+
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
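
For context (not part of the patch): a minimal usage sketch of the split API this change introduces, assuming a PFFFT_REAL setup and caller-allocated, SIMD-aligned buffers of N floats; the helper name, buffer names, and the gain parameter are illustrative only. VCPLXMUL above is pffft's in-place complex multiply, leaving the product's real and imaginary parts in its first two arguments. The new unscaled pffft_zconvolve_accumulate lets callers skip the per-element scaling multiply when no normalization is needed, while the renamed pffft_zconvolve_scale_accumulate keeps the old behavior.

#include "pffft.h"

/* Hypothetical helper, not part of pffft: accumulate the spectrum of x
 * convolved with h into spec. X, H, and work are scratch buffers; all
 * buffers must be SIMD-aligned and N floats long, with setup created as
 * pffft_new_setup(N, PFFFT_REAL).
 */
void accumulate_convolution(PFFFT_Setup *setup, const float *x, const float *h,
    float *X, float *H, float *spec, float *work, float gain)
{
    /* Forward transforms, left in pffft's internal "z-domain" order, which
     * is the layout the zconvolve functions expect.
     */
    pffft_transform(setup, x, X, work, PFFFT_FORWARD);
    pffft_transform(setup, h, H, work, PFFFT_FORWARD);

    if(gain == 1.0f)
    {
        /* New unscaled variant: spec += X*H (element-wise complex multiply). */
        pffft_zconvolve_accumulate(setup, X, H, spec);
    }
    else
    {
        /* Renamed scaled variant, equivalent to the old
         * pffft_zconvolve_accumulate(setup, X, H, spec, gain).
         */
        pffft_zconvolve_scale_accumulate(setup, X, H, spec, gain);
    }
}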