Skip to content

Commit 7d35c3a

Browse files
committed
Implement multi-streams Huffman codec for faster decompression
1 parent 5b2818c commit 7d35c3a

File tree

6 files changed

+354
-129
lines changed

6 files changed

+354
-129
lines changed

src/entropy/EntropyDecoderFactory.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ namespace kanzi {
6464
// Each block is decoded separately
6565
// Rebuild the entropy decoder to reset block statistics
6666
case HUFFMAN_TYPE:
67-
return new HuffmanDecoder(ibs);
67+
return new HuffmanDecoder(ibs, &ctx);
6868

6969
case ANS0_TYPE:
7070
return new ANSRangeDecoder(ibs, 0);

src/entropy/HuffmanDecoder.cpp

Lines changed: 185 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ const int HuffmanDecoder::TABLE_MASK = (1 << DECODING_BATCH_SIZE) - 1;
3030

3131
// The chunk size indicates how many bytes are encoded (per block) before
3232
// resetting the frequency stats.
33-
HuffmanDecoder::HuffmanDecoder(InputBitStream& bitstream, int chunkSize) : _bitstream(bitstream)
33+
HuffmanDecoder::HuffmanDecoder(InputBitStream& bitstream, Context* pCtx, int chunkSize) : _bitstream(bitstream)
3434
{
3535
if (chunkSize < 1024)
3636
throw invalid_argument("Huffman codec: The chunk size must be at least 1024");
@@ -44,6 +44,7 @@ HuffmanDecoder::HuffmanDecoder(InputBitStream& bitstream, int chunkSize) : _bits
4444
_chunkSize = chunkSize;
4545
_buffer = new byte[0];
4646
_bufferSize = 0;
47+
_pCtx = pCtx;
4748
reset();
4849
}
4950

@@ -108,7 +109,7 @@ int HuffmanDecoder::readLengths()
108109
// max(CodeLen) must be <= MAX_SYMBOL_SIZE
109110
bool HuffmanDecoder::buildDecodingTable(int count)
110111
{
111-
// Initialize table with non zero value.
112+
// Initialize table with non zero values.
112113
// If the bitstream is altered, the decoder may access these default table values.
113114
// The number of consumed bits cannot be 0.
114115
memset(_table, 8, sizeof(_table));
@@ -138,6 +139,187 @@ bool HuffmanDecoder::buildDecodingTable(int count)
138139
}
139140

140141
// Entry point of the decoder: selects the decoding routine that matches the
// bitstream version and forwards the request to it.
//   block  : destination array
//   blkptr : index of the first byte to write in 'block'
//   count  : number of bytes to decode
// Returns whatever the selected routine returns.
int HuffmanDecoder::decode(byte block[], uint blkptr, uint count)
{
    // Without a context, assume the current bitstream format (version 6).
    int bsVersion = 6;

    if (_pCtx != nullptr)
        bsVersion = _pCtx->getInt("bsVersion", 6);

    // Bitstreams older than version 6 use the legacy decoding routine.
    return (bsVersion >= 6) ? decodeV6(block, blkptr, count)
                            : decodeV5(block, blkptr, count);
}
150+
151+
152+
// Decodes 'count' bytes into block[blkptr .. blkptr+count) chunk by chunk.
// Each chunk holds at most _chunkSize bytes and is handled in one of 3 ways:
//   - fewer than 32 bytes: the bytes are read verbatim from the bitstream
//   - alphabet of size 1: the chunk is filled with the unique symbol
//   - otherwise: the decoding table is rebuilt and decodeChunk() is invoked
// Returns 'count' on success, the number of bytes decoded so far when the
// code lengths or the chunk data cannot be read, and -1 when the decoding
// table cannot be built.
int HuffmanDecoder::decodeV6(byte block[], uint blkptr, uint count)
{
    if (count == 0)
        return 0;

    // decodeChunk() splits the buffer in 4 equal regions (one per stream).
    const uint minBufSize = 2 * uint(_chunkSize);

    if (_bufferSize < minBufSize) {
        // Allocate first: if 'new' throws, _buffer and _bufferSize still
        // describe a valid allocation for the destructor to release.
        byte* newBuffer = new byte[minBufSize];
        delete[] _buffer;
        _buffer = newBuffer;
        _bufferSize = minBufSize;
    }

    uint startChunk = blkptr;
    const uint end = blkptr + count;

    while (startChunk < end) {
        const uint sizeChunk = min(uint(_chunkSize), end - startChunk);

        if (sizeChunk < 32) {
            // Special case for small chunks: bytes are stored uncompressed
            _bitstream.readBits(&block[startChunk], 8 * sizeChunk);
        }
        else {
            // For each chunk, read code lengths, rebuild codes, rebuild decoding table
            const int alphabetSize = readLengths();

            if (alphabetSize <= 0)
                return startChunk - blkptr;

            if (alphabetSize == 1) {
                // Shortcut for chunks with only one symbol
                memset(&block[startChunk], _alphabet[0], size_t(sizeChunk));
            }
            else {
                if (buildDecodingTable(alphabetSize) == false)
                    return -1;

                // Report only the bytes decoded before the failing chunk
                // (same convention as the readLengths() failure above)
                // instead of pretending that the whole block was decoded.
                if (decodeChunk(&block[startChunk], sizeChunk) == false)
                    return startChunk - blkptr;
            }
        }

        startChunk += sizeChunk;
    }

    return count;
}
200+
201+
// count is at least 32
202+
bool HuffmanDecoder::decodeChunk(byte block[], uint count)
203+
{
204+
// Read fragment sizes
205+
const int szBits0 = EntropyUtils::readVarInt(_bitstream);
206+
const int szBits1 = EntropyUtils::readVarInt(_bitstream);
207+
const int szBits2 = EntropyUtils::readVarInt(_bitstream);
208+
const int szBits3 = EntropyUtils::readVarInt(_bitstream);
209+
210+
if ((szBits0 < 0) || (szBits1 < 0) || (szBits2 < 0) || (szBits3 < 0))
211+
return false;
212+
213+
memset(_buffer, 0, _bufferSize);
214+
215+
int idx0 = 0 * (_bufferSize / 4);
216+
int idx1 = 1 * (_bufferSize / 4);
217+
int idx2 = 2 * (_bufferSize / 4);
218+
int idx3 = 3 * (_bufferSize / 4);
219+
220+
// Read all compressed data from bitstream
221+
_bitstream.readBits(&_buffer[idx0], szBits0);
222+
_bitstream.readBits(&_buffer[idx1], szBits1);
223+
_bitstream.readBits(&_buffer[idx2], szBits2);
224+
_bitstream.readBits(&_buffer[idx3], szBits3);
225+
226+
// State variables for each of the four parallel streams
227+
uint64 state0 = 0, state1 = 0, state2 = 0, state3 = 0; // bits read from bitstream
228+
uint8 bits0 = 0, bits1 = 0, bits2 = 0, bits3 = 0; // number of available bits in state
229+
uint8 bs0, bs1, bs2, bs3, shift;
230+
231+
#define READ_STATE(shift, state, idx, bits, bs) \
232+
shift = (56 - bits) & -8; \
233+
bs = bits + shift - DECODING_BATCH_SIZE; \
234+
state = (state << shift) | (uint64(BigEndian::readLong64(&_buffer[idx])) >> 1 >> (63 - shift)); /* handle shift = 0 */ \
235+
idx += (shift >> 3);
236+
237+
const int szFrag = count / 4;
238+
byte* block0 = &block[0 * szFrag];
239+
byte* block1 = &block[1 * szFrag];
240+
byte* block2 = &block[2 * szFrag];
241+
byte* block3 = &block[3 * szFrag];
242+
int n = 0;
243+
244+
while (n < szFrag - 4) {
245+
// Fill 64 bits of state from the bitstream for each stream
246+
READ_STATE(shift, state0, idx0, bits0, bs0);
247+
READ_STATE(shift, state1, idx1, bits1, bs1);
248+
READ_STATE(shift, state2, idx2, bits2, bs2);
249+
READ_STATE(shift, state3, idx3, bits3, bs3);
250+
251+
// Decompress 4 symbols per stream
252+
const uint16 val00 = _table[(state0 >> bs0) & TABLE_MASK]; bs0 -= uint8(val00);
253+
const uint16 val10 = _table[(state1 >> bs1) & TABLE_MASK]; bs1 -= uint8(val10);
254+
const uint16 val20 = _table[(state2 >> bs2) & TABLE_MASK]; bs2 -= uint8(val20);
255+
const uint16 val30 = _table[(state3 >> bs3) & TABLE_MASK]; bs3 -= uint8(val30);
256+
const uint16 val01 = _table[(state0 >> bs0) & TABLE_MASK]; bs0 -= uint8(val01);
257+
const uint16 val11 = _table[(state1 >> bs1) & TABLE_MASK]; bs1 -= uint8(val11);
258+
const uint16 val21 = _table[(state2 >> bs2) & TABLE_MASK]; bs2 -= uint8(val21);
259+
const uint16 val31 = _table[(state3 >> bs3) & TABLE_MASK]; bs3 -= uint8(val31);
260+
const uint16 val02 = _table[(state0 >> bs0) & TABLE_MASK]; bs0 -= uint8(val02);
261+
const uint16 val12 = _table[(state1 >> bs1) & TABLE_MASK]; bs1 -= uint8(val12);
262+
const uint16 val22 = _table[(state2 >> bs2) & TABLE_MASK]; bs2 -= uint8(val22);
263+
const uint16 val32 = _table[(state3 >> bs3) & TABLE_MASK]; bs3 -= uint8(val32);
264+
const uint16 val03 = _table[(state0 >> bs0) & TABLE_MASK]; bs0 -= uint8(val03);
265+
const uint16 val13 = _table[(state1 >> bs1) & TABLE_MASK]; bs1 -= uint8(val13);
266+
const uint16 val23 = _table[(state2 >> bs2) & TABLE_MASK]; bs2 -= uint8(val23);
267+
const uint16 val33 = _table[(state3 >> bs3) & TABLE_MASK]; bs3 -= uint8(val33);
268+
269+
bits0 = bs0 + DECODING_BATCH_SIZE;
270+
bits1 = bs1 + DECODING_BATCH_SIZE;
271+
bits2 = bs2 + DECODING_BATCH_SIZE;
272+
bits3 = bs3 + DECODING_BATCH_SIZE;
273+
274+
block0[n + 0] = byte(val00 >> 8);
275+
block1[n + 0] = byte(val10 >> 8);
276+
block2[n + 0] = byte(val20 >> 8);
277+
block3[n + 0] = byte(val30 >> 8);
278+
block0[n + 1] = byte(val01 >> 8);
279+
block1[n + 1] = byte(val11 >> 8);
280+
block2[n + 1] = byte(val21 >> 8);
281+
block3[n + 1] = byte(val31 >> 8);
282+
block0[n + 2] = byte(val02 >> 8);
283+
block1[n + 2] = byte(val12 >> 8);
284+
block2[n + 2] = byte(val22 >> 8);
285+
block3[n + 2] = byte(val32 >> 8);
286+
block0[n + 3] = byte(val03 >> 8);
287+
block1[n + 3] = byte(val13 >> 8);
288+
block2[n + 3] = byte(val23 >> 8);
289+
block3[n + 3] = byte(val33 >> 8);
290+
n += 4;
291+
}
292+
293+
// Fill 64 bits of state from the bitstream for each stream
294+
READ_STATE(shift, state0, idx0, bits0, bs0);
295+
READ_STATE(shift, state1, idx1, bits1, bs1);
296+
READ_STATE(shift, state2, idx2, bits2, bs2);
297+
READ_STATE(shift, state3, idx3, bits3, bs3);
298+
299+
while (n < szFrag) {
300+
// Decompress 1 symbol per stream
301+
const uint16 val0 = _table[(state0 >> bs0) & TABLE_MASK]; bs0 -= uint8(val0);
302+
const uint16 val1 = _table[(state1 >> bs1) & TABLE_MASK]; bs1 -= uint8(val1);
303+
const uint16 val2 = _table[(state2 >> bs2) & TABLE_MASK]; bs2 -= uint8(val2);
304+
const uint16 val3 = _table[(state3 >> bs3) & TABLE_MASK]; bs3 -= uint8(val3);
305+
306+
block0[n] = byte(val0 >> 8);
307+
block1[n] = byte(val1 >> 8);
308+
block2[n] = byte(val2 >> 8);
309+
block3[n] = byte(val3 >> 8);
310+
n++;
311+
}
312+
313+
// Process any remaining bytes at the end of the whole chunk
314+
const uint count4 = 4 * szFrag;
315+
316+
for (uint i = count4; i < count; i++)
317+
block[i] = byte(_bitstream.readBits(8));
318+
319+
return true;
320+
}
321+
322+
int HuffmanDecoder::decodeV5(byte block[], uint blkptr, uint count)
141323
{
142324
if (count == 0)
143325
return 0;
@@ -230,7 +412,7 @@ int HuffmanDecoder::decode(byte block[], uint blkptr, uint count)
230412

231413
// Sanity check
232414
if (bits > 64)
233-
return n;
415+
return n;
234416

235417
uint16 val;
236418

@@ -249,4 +431,3 @@ int HuffmanDecoder::decode(byte block[], uint blkptr, uint count)
249431

250432
return count;
251433
}
252-

src/entropy/HuffmanDecoder.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
#define _HuffmanDecoder_
1919

2020
#include "HuffmanCommon.hpp"
21+
#include "../Context.hpp"
2122
#include "../EntropyDecoder.hpp"
2223

2324

@@ -28,7 +29,7 @@ namespace kanzi
2829
class HuffmanDecoder : public EntropyDecoder
2930
{
3031
public:
31-
HuffmanDecoder(InputBitStream& bitstream, int chunkSize = HuffmanCommon::MAX_CHUNK_SIZE);
32+
HuffmanDecoder(InputBitStream& bitstream, Context* pCtx = nullptr, int chunkSize = HuffmanCommon::MAX_CHUNK_SIZE) ;
3233

3334
~HuffmanDecoder() { _dispose(); delete[] _buffer; }
3435

@@ -50,13 +51,20 @@ namespace kanzi
5051
uint16 _sizes[256];
5152
uint16 _table[1 << 12]; // decoding table: code -> size, symbol
5253
int _chunkSize;
54+
Context* _pCtx;
5355

5456
int readLengths();
5557

58+
bool decodeChunk(byte block[], uint count);
59+
5660
bool buildDecodingTable(int count);
5761

5862
bool reset();
5963

64+
int decodeV5(byte block[], uint blkptr, uint len);
65+
66+
int decodeV6(byte block[], uint blkptr, uint len);
67+
6068
void _dispose() const {}
6169
};
6270

0 commit comments

Comments
 (0)