diff options
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r-- | Alc/mixer_sse.c | 120 |
1 files changed, 40 insertions, 80 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c index 3c45fd21..719ebd23 100644 --- a/Alc/mixer_sse.c +++ b/Alc/mixer_sse.c @@ -1,6 +1,13 @@ #include "config.h" #ifdef HAVE_XMMINTRIN_H +#ifdef IN_IDE_PARSER +/* KDevelop's parser won't recognize these defines that get added by the -msse + * switch used to compile this source. Without them, xmmintrin.h fails to + * declare anything. */ +#define __MMX__ +#define __SSE__ +#endif #include <xmmintrin.h> #endif @@ -14,72 +21,26 @@ #include "mixer_defs.h" -static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2], - const ALuint IrSize, - ALfloat (*RESTRICT Coeffs)[2], - const ALfloat (*RESTRICT CoeffStep)[2], - ALfloat left, ALfloat right) +static inline void ApplyCoeffsStep(const ALuint IrSize, + ALfloat (*restrict Coeffs)[2], + const ALfloat (*restrict CoeffStep)[2]) { - const __m128 lrlr = { left, right, left, right }; - __m128 coeffs, deltas, imp0, imp1; - __m128 vals = _mm_setzero_ps(); + __m128 coeffs, deltas; ALuint i; - if((Offset&1)) + for(i = 0;i < IrSize;i += 2) { - const ALuint o0 = Offset&HRIR_MASK; - const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[0][0]); - deltas = _mm_load_ps(&CoeffStep[0][0]); - vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]); - imp0 = _mm_mul_ps(lrlr, coeffs); + coeffs = _mm_load_ps(&Coeffs[i][0]); + deltas = _mm_load_ps(&CoeffStep[i][0]); coeffs = _mm_add_ps(coeffs, deltas); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Coeffs[0][0], coeffs); - _mm_storel_pi((__m64*)&Values[o0][0], vals); - for(i = 1;i < IrSize-1;i += 2) - { - const ALuint o2 = (Offset+i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i+1][0]); - deltas = _mm_load_ps(&CoeffStep[i+1][0]); - vals = _mm_load_ps(&Values[o2][0]); - imp1 = _mm_mul_ps(lrlr, coeffs); - coeffs = _mm_add_ps(coeffs, deltas); - imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2)); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Coeffs[i+1][0], coeffs); - _mm_store_ps(&Values[o2][0], vals); - imp0 = imp1; - } - vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]); - imp0 = _mm_movehl_ps(imp0, imp0); - vals = _mm_add_ps(imp0, vals); - _mm_storel_pi((__m64*)&Values[o1][0], vals); - } - else - { - for(i = 0;i < IrSize;i += 2) - { - const ALuint o = (Offset + i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i][0]); - deltas = _mm_load_ps(&CoeffStep[i][0]); - vals = _mm_load_ps(&Values[o][0]); - imp0 = _mm_mul_ps(lrlr, coeffs); - coeffs = _mm_add_ps(coeffs, deltas); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Coeffs[i][0], coeffs); - _mm_store_ps(&Values[o][0], vals); - } + _mm_store_ps(&Coeffs[i][0], coeffs); } } -static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2], - const ALuint IrSize, - ALfloat (*RESTRICT Coeffs)[2], - ALfloat left, ALfloat right) +static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2], + const ALuint IrSize, + ALfloat (*restrict Coeffs)[2], + ALfloat left, ALfloat right) { const __m128 lrlr = { left, right, left, right }; __m128 vals = _mm_setzero_ps(); @@ -133,22 +94,21 @@ static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2], #undef SUFFIX -void MixDirect_SSE(const DirectParams *params, const ALfloat *RESTRICT data, ALuint srcchan, +void MixDirect_SSE(const DirectParams *params, const ALfloat *restrict data, ALuint srcchan, ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize) { - ALfloat (*RESTRICT DryBuffer)[BUFFERSIZE] = params->OutBuffer; - ALfloat *RESTRICT ClickRemoval = params->ClickRemoval; - ALfloat *RESTRICT PendingClicks = params->PendingClicks; + ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer; + ALfloat *restrict ClickRemoval = params->ClickRemoval; + ALfloat *restrict PendingClicks = params->PendingClicks; ALfloat DrySend; + __m128 gain; ALuint pos; ALuint c; for(c = 0;c < MaxChannels;c++) { - __m128 gain; - DrySend = params->Gains[srcchan][c]; - if(DrySend < 0.00001f) + if(!(DrySend > GAIN_SILENCE_THRESHOLD)) continue; if(OutPos == 0) @@ -158,12 +118,12 @@ void MixDirect_SSE(const DirectParams *params, const ALfloat *RESTRICT data, ALu for(pos = 0;BufferSize-pos > 3;pos += 4) { const __m128 val4 = _mm_load_ps(&data[pos]); - __m128 dry4 = _mm_load_ps(&DryBuffer[c][OutPos+pos]); + __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]); dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain)); - _mm_store_ps(&DryBuffer[c][OutPos+pos], dry4); + _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4); } for(;pos < BufferSize;pos++) - DryBuffer[c][OutPos+pos] += data[pos]*DrySend; + OutBuffer[c][OutPos+pos] += data[pos]*DrySend; if(OutPos+pos == SamplesToDo) PendingClicks[c] += data[pos]*DrySend; @@ -171,34 +131,34 @@ void MixDirect_SSE(const DirectParams *params, const ALfloat *RESTRICT data, ALu } -void MixSend_SSE(const SendParams *params, const ALfloat *RESTRICT data, +void MixSend_SSE(const SendParams *params, const ALfloat *restrict data, ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize) { - ALeffectslot *Slot = params->Slot; - ALfloat (*RESTRICT WetBuffer)[BUFFERSIZE] = Slot->WetBuffer; - ALfloat *RESTRICT WetClickRemoval = Slot->ClickRemoval; - ALfloat *RESTRICT WetPendingClicks = Slot->PendingClicks; - const ALfloat WetGain = params->Gain; + ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer; + ALfloat *restrict ClickRemoval = params->ClickRemoval; + ALfloat *restrict PendingClicks = params->PendingClicks; + ALfloat WetGain; __m128 gain; ALuint pos; - if(WetGain < 0.00001f) + WetGain = params->Gain; + if(!(WetGain > GAIN_SILENCE_THRESHOLD)) return; if(OutPos == 0) - WetClickRemoval[0] -= data[0] * WetGain; + ClickRemoval[0] -= data[0] * WetGain; gain = _mm_set1_ps(WetGain); for(pos = 0;BufferSize-pos > 3;pos += 4) { const __m128 val4 = _mm_load_ps(&data[pos]); - __m128 wet4 = _mm_load_ps(&WetBuffer[0][OutPos+pos]); + __m128 wet4 = _mm_load_ps(&OutBuffer[0][OutPos+pos]); wet4 = _mm_add_ps(wet4, _mm_mul_ps(val4, gain)); - _mm_store_ps(&WetBuffer[0][OutPos+pos], wet4); + _mm_store_ps(&OutBuffer[0][OutPos+pos], wet4); } for(;pos < BufferSize;pos++) - WetBuffer[0][OutPos+pos] += data[pos] * WetGain; + OutBuffer[0][OutPos+pos] += data[pos] * WetGain; if(OutPos+pos == SamplesToDo) - WetPendingClicks[0] += data[pos] * WetGain; + PendingClicks[0] += data[pos] * WetGain; } |