aboutsummaryrefslogtreecommitdiffstats
path: root/Alc/mixer_sse.c
diff options
context:
space:
mode:
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r-- Alc/mixer_sse.c 214
1 files changed, 116 insertions, 98 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index c4e1fdf5..090b7a5a 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -1,12 +1,5 @@
#include "config.h"
-#ifdef IN_IDE_PARSER
-/* KDevelop's parser won't recognize these defines that get added by the -msse
- * switch used to compile this source. Without them, xmmintrin.h fails to
- * declare anything. */
-#define __MMX__
-#define __SSE__
-#endif
#include <xmmintrin.h>
#include "AL/al.h"
@@ -19,6 +12,82 @@
#include "mixer_defs.h"
+const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *src, ALuint frac,
+ ALuint increment, ALfloat *restrict dst, ALuint dstlen)
+{ /* Resamples src into dst with the SSE bsinc filter, writing dstlen samples.
+ *
+ * For every output sample the filter taps are rebuilt from the coefficient
+ * set state->coeffs[pi], blended by state->sf and the phase factor pf, and
+ * applied to state->m source samples starting at src[state->l]. frac and
+ * increment are fixed-point values with FRACTIONBITS fractional bits; after
+ * each sample src advances by the whole part of frac+increment.
+ *
+ * Returns dst.
+ *
+ * NOTE(review): the tap loop advances 4 at a time and reads the coefficient
+ * tables with _mm_load_ps, so it appears to assume m is a multiple of 4 and
+ * that each table is 16-byte aligned -- confirm against BsincState. src is
+ * read with the unaligned _mm_loadu_ps.
+ */
+ const __m128 sf4 = _mm_set1_ps(state->sf);
+ const ALuint m = state->m;
+ const ALint l = state->l;
+ const ALfloat *fil, *scd, *phd, *spd;
+ ALuint pi, j_f, i;
+ ALfloat pf;
+ ALint j_s;
+ __m128 r4;
+
+ for(i = 0;i < dstlen;i++)
+ {
+ // Split frac: pi is frac shifted down by FRAC_PHASE_BITDIFF bits, pf is the discarded low bits scaled into [0, 1).
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
+ pi = frac >> FRAC_PHASE_BITDIFF;
+ pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
+#undef FRAC_PHASE_BITDIFF
+
+ fil = state->coeffs[pi].filter;
+ scd = state->coeffs[pi].scDelta;
+ phd = state->coeffs[pi].phDelta;
+ spd = state->coeffs[pi].spDelta;
+
+ // Apply the scale and phase interpolated filter: per tap f = fil + sf*scd + pf*(phd + sf*spd), four taps per iteration.
+ r4 = _mm_setzero_ps();
+ {
+ const __m128 pf4 = _mm_set1_ps(pf);
+ for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
+ {
+ const __m128 f4 = _mm_add_ps(
+ _mm_add_ps(
+ _mm_load_ps(&fil[j_f]),
+ _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
+ ),
+ _mm_mul_ps(
+ pf4,
+ _mm_add_ps(
+ _mm_load_ps(&phd[j_f]),
+ _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
+ )
+ )
+ );
+ r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
+ }
+ }
+ r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); /* add the lane-reversed copy: lane 0 = r0+r3, lane 2 = r2+r1 */
+ r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); /* fold the upper half onto the lower half: lane 0 now holds the sum of all four lanes */
+ dst[i] = _mm_cvtss_f32(r4);
+
+ frac += increment;
+ src += frac>>FRACTIONBITS; /* step src by the whole-sample part of frac */
+ frac &= FRACTIONMASK;
+ }
+ return dst;
+}
+
+
+static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs)[2],
+ const HrtfParams *hrtfparams,
+ ALuint IrSize, ALuint Counter)
+{ /* Computes OutCoeffs = hrtfparams->Coeffs - hrtfparams->CoeffStep * Counter
+ * for the first IrSize coefficient pairs.
+ *
+ * Each iteration handles two [2]-float rows (four floats) with one aligned
+ * load/store, hence the i += 2 stride.
+ *
+ * NOTE(review): this presumably requires IrSize to be even and all three
+ * arrays to be 16-byte aligned (aligned _mm_load_ps/_mm_store_ps) -- confirm
+ * with the callers.
+ */
+ const __m128 counter4 = _mm_set1_ps((float)Counter);
+ __m128 coeffs, step4;
+ ALuint i;
+
+ for(i = 0;i < IrSize;i += 2)
+ {
+ step4 = _mm_load_ps(&hrtfparams->CoeffStep[i][0]);
+ coeffs = _mm_load_ps(&hrtfparams->Coeffs[i][0]);
+ coeffs = _mm_sub_ps(coeffs, _mm_mul_ps(step4, counter4));
+ _mm_store_ps(&OutCoeffs[i][0], coeffs);
+ }
+}
+
static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
const ALuint IrSize,
ALfloat (*restrict Coeffs)[2],
@@ -133,129 +202,78 @@ static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
}
}
-#define SUFFIX SSE
+#define MixHrtf MixHrtf_SSE /* MixHrtf expands to MixHrtf_SSE only while mixer_inc.c is included; presumably that file defines MixHrtf -- confirm */
#include "mixer_inc.c"
-#undef SUFFIX
+#undef MixHrtf
-void MixDirect_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
- MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
+void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
+ MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize) /* Mix_SSE:
+ * Adds BufferSize samples of data, scaled by a per-channel gain, into
+ * OutBuffer[c] starting at index OutPos, for every channel c < OutChans.
+ *
+ * When Gains[c].Step is non-zero and Counter > 0, the gain starts at
+ * Gains[c].Current and increases by Step per sample for the first
+ * min(BufferSize, Counter) samples. Gains[c].Current is then written back,
+ * using Gains[c].Target if exactly Counter samples were consumed.
+ * The remaining samples use that constant gain and are skipped entirely
+ * when its magnitude is not above GAIN_SILENCE_THRESHOLD.
+ *
+ * NOTE(review): the vector paths use aligned _mm_load_ps/_mm_store_ps, so
+ * data and &OutBuffer[c][OutPos] presumably must be 16-byte aligned --
+ * confirm with the callers.
+ */
{
- ALfloat DrySend, Step;
- __m128 gain, step;
+ ALfloat gain, step;
+ __m128 gain4;
ALuint c;
- for(c = 0;c < MaxChannels;c++)
+ for(c = 0;c < OutChans;c++)
{
ALuint pos = 0;
- DrySend = Gains->Current[c];
- Step = Gains->Step[c];
- if(Step != 1.0f && Counter > 0)
+ gain = Gains[c].Current;
+ step = Gains[c].Step;
+ if(step != 0.0f && Counter > 0) /* the gain changes from sample to sample */
{
+ ALuint minsize = minu(BufferSize, Counter); /* number of samples mixed with a stepping gain */
/* Mix with applying gain steps in aligned multiples of 4. */
- if(BufferSize-pos > 3 && Counter-pos > 3)
+ if(minsize-pos > 3)
{
- gain = _mm_setr_ps(
- DrySend,
- DrySend * Step,
- DrySend * Step * Step,
- DrySend * Step * Step * Step
+ __m128 step4;
+ gain4 = _mm_setr_ps(
+ gain,
+ gain + step,
+ gain + step + step,
+ gain + step + step + step
);
- step = _mm_set1_ps(Step * Step * Step * Step);
+ step4 = _mm_set1_ps(step + step + step + step); /* each vector iteration advances every lane by four steps */
do {
const __m128 val4 = _mm_load_ps(&data[pos]);
__m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
- dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
- gain = _mm_mul_ps(gain, step);
+ dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
+ gain4 = _mm_add_ps(gain4, step4);
_mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
pos += 4;
- } while(BufferSize-pos > 3 && Counter-pos > 3);
- DrySend = _mm_cvtss_f32(gain);
+ } while(minsize-pos > 3);
+ /* NOTE: gain4 now represents the next four gains after the
+ * last four mixed samples, so the lowest element represents
+ * the next gain to apply.
+ */
+ gain = _mm_cvtss_f32(gain4);
}
/* Mix with applying left over gain steps that aren't aligned multiples of 4. */
- for(;pos < BufferSize && pos < Counter;pos++)
+ for(;pos < minsize;pos++)
{
- OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
- DrySend *= Step;
+ OutBuffer[c][OutPos+pos] += data[pos]*gain;
+ gain += step;
}
if(pos == Counter)
- DrySend = Gains->Target[c];
- Gains->Current[c] = DrySend;
+ gain = Gains[c].Target;
+ Gains[c].Current = gain;
+
/* Mix until pos is aligned with 4 or the mix is done. */
- for(;pos < BufferSize && (pos&3) != 0;pos++)
- OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
+ minsize = minu(BufferSize, (pos+3)&~3); /* pos rounded up to a multiple of 4, capped at BufferSize */
+ for(;pos < minsize;pos++)
+ OutBuffer[c][OutPos+pos] += data[pos]*gain;
}
- if(!(DrySend > GAIN_SILENCE_THRESHOLD))
+ if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD)) /* magnitude test, so negative gains still mix; the negated > also skips a NaN gain */
continue;
- gain = _mm_set1_ps(DrySend);
+ gain4 = _mm_set1_ps(gain);
for(;BufferSize-pos > 3;pos += 4)
{
const __m128 val4 = _mm_load_ps(&data[pos]);
__m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
- dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
+ dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
_mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
}
for(;pos < BufferSize;pos++)
- OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
- }
-}
-
-
-void MixSend_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
- MixGainMono *Gain, ALuint Counter, ALuint OutPos, ALuint BufferSize)
-{
- ALfloat WetGain, Step;
- __m128 gain, step;
-
- {
- ALuint pos = 0;
- WetGain = Gain->Current;
- Step = Gain->Step;
- if(Step != 1.0f && Counter > 0)
- {
- if(BufferSize-pos > 3 && Counter-pos > 3)
- {
- gain = _mm_setr_ps(
- WetGain,
- WetGain * Step,
- WetGain * Step * Step,
- WetGain * Step * Step * Step
- );
- step = _mm_set1_ps(Step * Step * Step * Step);
- do {
- const __m128 val4 = _mm_load_ps(&data[pos]);
- __m128 dry4 = _mm_load_ps(&OutBuffer[0][OutPos+pos]);
- dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
- gain = _mm_mul_ps(gain, step);
- _mm_store_ps(&OutBuffer[0][OutPos+pos], dry4);
- pos += 4;
- } while(BufferSize-pos > 3 && Counter-pos > 3);
- WetGain = _mm_cvtss_f32(gain);
- }
- for(;pos < BufferSize && pos < Counter;pos++)
- {
- OutBuffer[0][OutPos+pos] += data[pos]*WetGain;
- WetGain *= Step;
- }
- if(pos == Counter)
- WetGain = Gain->Target;
- Gain->Current = WetGain;
- for(;pos < BufferSize && (pos&3) != 0;pos++)
- OutBuffer[0][OutPos+pos] += data[pos]*WetGain;
- }
-
- if(!(WetGain > GAIN_SILENCE_THRESHOLD))
- return;
- gain = _mm_set1_ps(WetGain);
- for(;BufferSize-pos > 3;pos += 4)
- {
- const __m128 val4 = _mm_load_ps(&data[pos]);
- __m128 wet4 = _mm_load_ps(&OutBuffer[0][OutPos+pos]);
- wet4 = _mm_add_ps(wet4, _mm_mul_ps(val4, gain));
- _mm_store_ps(&OutBuffer[0][OutPos+pos], wet4);
- }
- for(;pos < BufferSize;pos++)
- OutBuffer[0][OutPos+pos] += data[pos] * WetGain;
+ OutBuffer[c][OutPos+pos] += data[pos]*gain;
}
}