Skip to content

Commit 18f41cd

Browse files
committed
Rewrite layout of BWT metadata
1 parent 93df56c commit 18f41cd

File tree

2 files changed

+92
-73
lines changed

2 files changed

+92
-73
lines changed

src/transform/BWTBlockCodec.cpp

Lines changed: 85 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ limitations under the License.
1515

1616
#include <cstring>
1717
#include "BWTBlockCodec.hpp"
18+
#include "../Global.hpp"
1819

1920
using namespace kanzi;
2021

2122

2223
// Constructs the codec from the given context.
// - Allocates the underlying BWT instance with `new BWT(ctx)` and stores it in _pBWT.
// - Caches the integer value stored under the "bsVersion" key of ctx in _bsVersion;
//   inverse() compares it against 5 to choose which header layout to parse.
// NOTE(review): _pBWT is a raw pointer; the matching delete is not visible in this
// view - confirm the destructor releases it.
BWTBlockCodec::BWTBlockCodec(Context& ctx)
2324
{
2425
_pBWT = new BWT(ctx);
26+
_bsVersion = ctx.getInt("bsVersion");
2527
}
2628

2729
// Return true if the compression chain succeeded. In this case, the input data
@@ -43,71 +45,50 @@ bool BWTBlockCodec::forward(SliceArray<byte>& input, SliceArray<byte>& output, i
4345
if (output._length - output._index < getMaxEncodedLength(blockSize))
4446
return false;
4547

46-
byte* p0 = &output._array[output._index];
47-
const int chunks = BWT::getBWTChunks(blockSize);
48-
int log = 1;
49-
50-
while (1 << log <= blockSize)
51-
log++;
52-
53-
// Estimate header size based on block size
54-
const int headerSizeBytes1 = chunks * ((2 + log + 7) >> 3);
55-
output._index += headerSizeBytes1;
56-
57-
// Apply forward transform
58-
if (_pBWT->forward(input, output, blockSize) == false)
59-
return false;
48+
int logBlockSize = Global::_log2(uint32(blockSize));
6049

61-
int headerSizeBytes2 = 0;
50+
if ((blockSize & (blockSize - 1)) != 0)
51+
logBlockSize++;
6252

63-
for (int i = 0; i < chunks; i++) {
64-
int primaryIndex = _pBWT->getPrimaryIndex(i);
65-
int pIndexSizeBits = 6;
53+
const int pIndexSize = (logBlockSize + 7) >> 3;
6654

67-
while ((1 << pIndexSizeBits) <= primaryIndex)
68-
pIndexSizeBits++;
55+
if ((pIndexSize <= 0) || (pIndexSize >= 5))
56+
return false;
6957

70-
// Compute block size based on primary index
71-
headerSizeBytes2 += ((2 + pIndexSizeBits + 7) >> 3);
72-
}
73-
74-
if (headerSizeBytes2 != headerSizeBytes1) {
75-
// Adjust space for header
76-
memmove(&p0[headerSizeBytes2], &p0[headerSizeBytes1], blockSize);
77-
output._index = output._index - headerSizeBytes1 + headerSizeBytes2;
78-
}
58+
const int chunks = BWT::getBWTChunks(blockSize);
59+
const int logNbChunks = Global::_log2(uint32(chunks));
7960

80-
int idx = 0;
61+
if (logNbChunks > 7)
62+
return false;
8163

82-
for (int i = 0; i < chunks; i++) {
83-
int primaryIndex = _pBWT->getPrimaryIndex(i);
84-
int pIndexSizeBits = 6;
64+
byte* dst = &output._array[output._index];
65+
output._index += (1 + chunks * pIndexSize);
8566

86-
while ((1 << pIndexSizeBits) <= primaryIndex)
87-
pIndexSizeBits++;
67+
// Apply forward transform
68+
if (_pBWT->forward(input, output, blockSize) == false)
69+
return false;
8870

89-
// Compute primary index size
90-
const int pIndexSizeBytes = (2 + pIndexSizeBits + 7) >> 3;
71+
const byte mode = byte((logNbChunks << 2) | (pIndexSize - 1));
9172

92-
// Write block header (mode + primary index). See top of header file for format
93-
int shift = (pIndexSizeBytes - 1) << 3;
94-
int blockMode = (pIndexSizeBits + 1) >> 3;
95-
blockMode = (blockMode << 6) | ((primaryIndex >> shift) & 0x3F);
96-
p0[idx++] = byte(blockMode);
73+
// Emit header
74+
for (int i = 0, idx = 1; i < chunks; i++) {
75+
const int primaryIndex = _pBWT->getPrimaryIndex(i) - 1;
76+
int shift = (pIndexSize - 1) << 3;
9777

98-
while (shift >= 8) {
78+
while (shift >= 0) {
79+
dst[idx++] = byte(primaryIndex >> shift);
9980
shift -= 8;
100-
p0[idx++] = byte(primaryIndex >> shift);
10181
}
10282
}
10383

84+
dst[0] = mode;
10485
return true;
10586
}
10687

10788
bool BWTBlockCodec::inverse(SliceArray<byte>& input, SliceArray<byte>& output, int blockSize)
10889
{
109-
if (blockSize == 0)
110-
return true;
90+
if (blockSize <= 1)
91+
return blockSize == 0;
11192

11293
if (!SliceArray<byte>::isValid(input))
11394
throw std::invalid_argument("BWTBlockCodec: Invalid input block");
@@ -118,30 +99,71 @@ bool BWTBlockCodec::inverse(SliceArray<byte>& input, SliceArray<byte>& output, i
11899
if (input._array == output._array)
119100
return false;
120101

121-
const int chunks = BWT::getBWTChunks(blockSize);
102+
if (_bsVersion > 5) {
103+
// Since bsVersion 6, a single leading mode byte encodes log2(number of chunks) and the primary index size in bytes
104+
const byte* src = &input._array[input._index];
105+
byte mode = src[0];
106+
const uint logNbChunks = uint(mode >> 2) & 0x07;
107+
const int pIndexSize = (int(mode) & 0x03) + 1;
122108

123-
for (int i = 0; i < chunks; i++) {
124-
// Read block header (mode + primary index). See top of header file for format
125-
const int blockMode = int(input._array[input._index++]);
126-
const int pIndexSizeBytes = 1 + ((blockMode >> 6) & 0x03);
109+
if (pIndexSize == 0)
110+
return false;
127111

128-
if (blockSize < pIndexSizeBytes)
129-
return false;
112+
const int chunks = 1 << logNbChunks;
130113

131-
blockSize -= pIndexSizeBytes;
132-
int shift = (pIndexSizeBytes - 1) << 3;
133-
int primaryIndex = (blockMode & 0x3F) << shift;
114+
if (chunks != BWT::getBWTChunks(blockSize))
115+
return false;
134116

135-
// Extract BWT primary index
136-
for (int n = 1; n < pIndexSizeBytes; n++) {
137-
shift -= 8;
138-
primaryIndex |= (int(input._array[input._index++]) << shift);
139-
}
117+
const int headerSize = 1 + chunks * pIndexSize;
140118

141-
if (_pBWT->setPrimaryIndex(i, primaryIndex) == false)
142-
return false;
119+
if ((input._length < headerSize) || (blockSize < headerSize))
120+
return false;
121+
122+
// Read header
123+
for (int i = 0, idx = 1; i < chunks; i++) {
124+
int shift = (pIndexSize - 1) << 3;
125+
int primaryIndex = 0;
126+
127+
// Extract BWT primary index
128+
while (shift >= 0) {
129+
primaryIndex = (primaryIndex << 8) | int(src[idx++]);
130+
shift -= 8;
131+
}
132+
133+
if (_pBWT->setPrimaryIndex(i, primaryIndex + 1) == false)
134+
return false;
135+
}
136+
137+
input._index += headerSize;
138+
blockSize -= headerSize;
139+
}
140+
else {
141+
const int chunks = BWT::getBWTChunks(blockSize);
142+
143+
for (int i = 0; i < chunks; i++) {
144+
// Read block header (mode + primary index). See top of header file for format
145+
const int blockMode = int(input._array[input._index++]);
146+
const int pIndexSizeBytes = 1 + ((blockMode >> 6) & 0x03);
147+
148+
if (blockSize < pIndexSizeBytes)
149+
return false;
150+
151+
blockSize -= pIndexSizeBytes;
152+
int shift = (pIndexSizeBytes - 1) << 3;
153+
int primaryIndex = (blockMode & 0x3F) << shift;
154+
155+
// Extract BWT primary index
156+
for (int n = 1; n < pIndexSizeBytes; n++) {
157+
shift -= 8;
158+
primaryIndex |= (int(input._array[input._index++]) << shift);
159+
}
160+
161+
if (_pBWT->setPrimaryIndex(i, primaryIndex) == false)
162+
return false;
163+
}
143164
}
144165

145166
// Apply inverse Transform
146167
return _pBWT->inverse(input, output, blockSize);
147168
}
169+

src/transform/BWTBlockCodec.hpp

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,12 @@ namespace kanzi {
2525

2626
// Utility class to encode/decode a BWT data block and its associated primary index(es)
2727

28-
// BWT stream format: Header (m bytes) Data (n bytes)
29-
// Header: For each primary index,
30-
// mode (8 bits) + primary index (8,16 or 24 bits)
31-
// mode: bits 7-6 contain the size in bits of the primary index :
32-
// 00: primary index size <= 6 bits (fits in mode byte)
33-
// 01: primary index size <= 14 bits (1 extra byte)
34-
// 10: primary index size <= 22 bits (2 extra bytes)
35-
// 11: primary index size > 22 bits (3 extra bytes)
36-
// bits 5-0 contain 6 most significant bits of primary index
37-
// primary index: remaining bits (up to 3 bytes)
28+
// BWT stream format: Header (mode + primary index(es)) | Data (n bytes)
29+
// mode (8 bits): xxxyyyzz
30+
// xxx: unused (written as 0 by the encoder, ignored by the decoder)
31+
// yyy: log2(number of chunks)
32+
// zz: primary index size - 1 (in bytes)
33+
// primary indexes: one per chunk, each 8, 16, 24 or 32 bits, big-endian, stored as (primary index - 1)
3834

3935
class BWTBlockCodec FINAL : public Transform<byte> {
4036
public:
@@ -55,6 +51,7 @@ namespace kanzi {
5551

5652
private:
5753
BWT* _pBWT;
54+
int _bsVersion;
5855
};
5956
}
6057
#endif

0 commit comments

Comments
 (0)