1 files changed, 83 insertions, 59 deletions
diff --git a/core/voice.cpp b/core/voice.cpp
index 03b56317..c02210ef 100644
--- a/core/voice.cpp
+++ b/core/voice.cpp
@@ -298,7 +298,7 @@ inline void LoadSamples<FmtIMA4>(float *RESTRICT dstSamples, const al::byte *src
         sample = (sample^0x8000) - 32768;
         index = clampi((index^0x8000) - 32768, 0, al::size(IMAStep_size)-1);
 
-        if(!skip) [[likely]]
+        if(skip == 0)
         {
             dstSamples[wrote++] = static_cast<float>(sample) / 32768.0f;
             if(wrote == samplesToLoad) return;
@@ -306,38 +306,50 @@ inline void LoadSamples<FmtIMA4>(float *RESTRICT dstSamples, const al::byte *src
         else
             --skip;
 
-        int samples[8]{};
+        auto decode_sample = [&sample,&index](const uint nibble)
+        {
+            sample += IMA4Codeword[nibble] * IMAStep_size[index] / 8;
+            sample = clampi(sample, -32768, 32767);
+
+            index += IMA4Index_adjust[nibble];
+            index = clampi(index, 0, al::size(IMAStep_size)-1);
+
+            return sample;
+        };
+
+        /* The rest of the block is arranged as a series of nibbles, contained
+         * in 4 *bytes* per channel interleaved. So we can decode a series of 8
+         * samples at once from each of these 4 bytes.
+         *
+         * First, decode the 8 sample sets being skipped entirely (they still
+         * need to be decoded for proper state on the remaining samples).
+         */
+        const size_t startOffset{(skip&~size_t{7}) + 1};
         const al::byte *nibbleData{src + (srcStep+srcChan)*4};
-        for(size_t i{1};i < samplesPerBlock;i+=8)
+        for(;skip >= 8;skip-=8)
         {
-            /* The rest of the block is arranged as a series of nibbles, with 4
-             * bytes per channel interleaved. So we can decode a series of 8
-             * samples at once from these next 4 bytes.
-             */
             uint code{uint{nibbleData[0]} | (uint{nibbleData[1]} << 8)
                 | (uint{nibbleData[2]} << 16) | (uint{nibbleData[3]} << 24)};
+            nibbleData += 4*srcStep;
+
             for(size_t j{0};j < 8;++j)
             {
-                const uint nibble{code & 0xf};
+                std::ignore = decode_sample(code & 0xf);
                 code >>= 4;
-
-                sample += IMA4Codeword[nibble] * IMAStep_size[index] / 8;
-                sample = clampi(sample, -32768, 32767);
-                samples[j] = sample;
-
-                index += IMA4Index_adjust[nibble];
-                index = clampi(index, 0, al::size(IMAStep_size)-1);
             }
+        }
+
+        int samples[8]{};
+        for(size_t i{startOffset};i < samplesPerBlock;i+=8)
+        {
+            uint code{uint{nibbleData[0]} | (uint{nibbleData[1]} << 8)
+                | (uint{nibbleData[2]} << 16) | (uint{nibbleData[3]} << 24)};
             nibbleData += 4*srcStep;
 
-            /* If we're skipping these 8 samples, go on to the next set. They
-             * still need to be decoded to update the predictor state for the
-             * next set.
-             */
-            if(skip >= 8)
+            for(size_t j{0};j < 8;++j)
             {
-                skip -= 8;
-                continue;
+                samples[j] = decode_sample(code & 0xf);
+                code >>= 4;
             }
 
             const size_t todo{minz(8-skip, samplesToLoad-wrote)};
@@ -389,28 +401,58 @@ inline void LoadSamples<FmtMSADPCM>(float *RESTRICT dstSamples, const al::byte *
         /* The second history sample is "older", so it's the first to be
          * written out.
          */
-        if(skip < 2) [[likely]]
+        if(skip == 0)
         {
-            if(!skip) [[likely]]
-            {
-                dstSamples[wrote++] = static_cast<float>(sampleHistory[1]) / 32768.0f;
-                if(wrote == samplesToLoad) return;
-            }
-            else
-                --skip;
+            dstSamples[wrote++] = static_cast<float>(sampleHistory[1]) / 32768.0f;
+            if(wrote == samplesToLoad) return;
+            dstSamples[wrote++] = static_cast<float>(sampleHistory[0]) / 32768.0f;
+            if(wrote == samplesToLoad) return;
+        }
+        else if(skip == 1)
+        {
+            --skip;
             dstSamples[wrote++] = static_cast<float>(sampleHistory[0]) / 32768.0f;
             if(wrote == samplesToLoad) return;
         }
         else
             skip -= 2;
 
-        int samples[8]{};
+        auto decode_sample = [&sampleHistory,&delta,blockpred](const int nibble)
+        {
+            int pred{(sampleHistory[0]*MSADPCMAdaptionCoeff[blockpred][0] +
+                sampleHistory[1]*MSADPCMAdaptionCoeff[blockpred][1]) / 256};
+            pred += ((nibble^0x08) - 0x08) * delta;
+            pred  = clampi(pred, -32768, 32767);
+
+            sampleHistory[1] = sampleHistory[0];
+            sampleHistory[0] = pred;
+
+            delta = (MSADPCMAdaption[nibble] * delta) / 256;
+            delta = maxi(16, delta);
+
+            return pred;
+        };
+
+        /* The rest of the block is a series of nibbles, interleaved per-
+         * channel. Decode the number of samples that we need to skip in the
+         * block.
+         */
+        const size_t startOffset{skip + 2};
         size_t nibbleOffset{srcChan};
-        for(size_t i{2};i < samplesPerBlock;)
+        for(;skip;--skip)
         {
-            /* The rest of the block is a series of nibbles, interleaved per-
-             * channel. Here we decode a set of (up to) 8 samples at a time to
-             * write out together.
+            const size_t byteOffset{nibbleOffset>>1};
+            const size_t byteShift{((nibbleOffset&1)^1) * 4};
+            std::ignore = decode_sample((input[byteOffset]>>byteShift) & 15);
+            nibbleOffset += srcStep;
+        }
+
+        int samples[8]{};
+        for(size_t i{startOffset};i < samplesPerBlock;)
+        {
+            /* Here we decode a set of (up to) 8 samples at a time to write out
+             * together. This is more efficient than decoding each sample
+             * individually and checking for the end each time.
              */
             const size_t todo{minz(samplesPerBlock-i, 8)};
 
@@ -418,33 +460,15 @@ inline void LoadSamples<FmtMSADPCM>(float *RESTRICT dstSamples, const al::byte *
             {
                 const size_t byteOffset{nibbleOffset>>1};
                 const size_t byteShift{((nibbleOffset&1)^1) * 4};
-                const int nibble{(input[byteOffset]>>byteShift) & 15};
+                samples[j] = decode_sample((input[byteOffset]>>byteShift) & 15);
                 nibbleOffset += srcStep;
-
-                int pred{(sampleHistory[0]*MSADPCMAdaptionCoeff[blockpred][0] +
-                    sampleHistory[1]*MSADPCMAdaptionCoeff[blockpred][1]) / 256};
-                pred += ((nibble^0x08) - 0x08) * delta;
-                pred  = clampi(pred, -32768, 32767);
-
-                sampleHistory[1] = sampleHistory[0];
-                sampleHistory[0] = pred;
-                samples[j] = pred;
-
-                delta = (MSADPCMAdaption[nibble] * delta) / 256;
-                delta = maxi(16, delta);
             }
 
-            if(skip < todo) [[likely]]
-            {
-                const size_t towrite{minz(todo-skip, samplesToLoad-wrote)};
-                for(size_t j{0};j < towrite;++j)
-                    dstSamples[wrote++] = static_cast<float>(samples[j+skip]) / 32768.0f;
-                if(wrote == samplesToLoad)
-                    return;
-                skip = 0;
-            }
-            else
-                skip -= todo;
+            const size_t towrite{minz(todo, samplesToLoad-wrote)};
+            for(size_t j{0};j < towrite;++j)
+                dstSamples[wrote++] = static_cast<float>(samples[j]) / 32768.0f;
+            if(wrote == samplesToLoad)
+                return;
 
             i += todo;
         }