Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 84 additions & 12 deletions rust/lance-encoding/src/compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,14 @@ fn try_rle_for_mini_block(
let rle_bytes = (estimated_pairs as u128) * ((type_size + 1) as u128);

if rle_bytes < raw_bytes {
#[cfg(feature = "bitpacking")]
{
if let Some(bitpack_bytes) = estimate_inline_bitpacking_bytes(data) {
if (bitpack_bytes as u128) < rle_bytes {
return None;
}
}
}
return Some(Box::new(RleMiniBlockEncoder::new()));
}
None
Expand All @@ -208,19 +216,8 @@ fn try_rle_for_mini_block(
fn try_bitpack_for_mini_block(_data: &FixedWidthDataBlock) -> Option<Box<dyn MiniBlockCompressor>> {
#[cfg(feature = "bitpacking")]
{
use arrow_array::cast::AsArray;

let bits = _data.bits_per_value;
if !matches!(bits, 8 | 16 | 32 | 64) {
return None;
}

let bit_widths = _data.expect_stat(Stat::BitWidth);
let widths = bit_widths.as_primitive::<UInt64Type>();
let too_small = widths.len() == 1
&& InlineBitpacking::min_size_bytes(widths.value(0)) >= _data.data_size();

if !too_small {
if estimate_inline_bitpacking_bytes(_data).is_some() {
return Some(Box::new(InlineBitpacking::new(bits)));
}
None
Expand All @@ -231,6 +228,40 @@ fn try_bitpack_for_mini_block(_data: &FixedWidthDataBlock) -> Option<Box<dyn Min
}
}

#[cfg(feature = "bitpacking")]
fn estimate_inline_bitpacking_bytes(data: &FixedWidthDataBlock) -> Option<u64> {
use arrow_array::cast::AsArray;

let bits = data.bits_per_value;
if !matches!(bits, 8 | 16 | 32 | 64) {
return None;
}
if data.num_values == 0 {
return None;
}

let bit_widths = data.expect_stat(Stat::BitWidth);
let widths = bit_widths.as_primitive::<UInt64Type>();

let words_per_chunk: u128 = 1;
let word_bytes: u128 = (bits / 8) as u128;
let mut total_words: u128 = 0;
for i in 0..widths.len() {
let bit_width = widths.value(i) as u128;
let packed_words = (1024u128 * bit_width) / (bits as u128);
total_words = total_words.saturating_add(words_per_chunk.saturating_add(packed_words));
}

let estimated_bytes = total_words.saturating_mul(word_bytes);
let raw_bytes = data.data_size() as u128;

if estimated_bytes >= raw_bytes {
return None;
}

u64::try_from(estimated_bytes).ok()
}

fn try_bitpack_for_block(
data: &FixedWidthDataBlock,
) -> Option<(Box<dyn BlockCompressor>, CompressiveEncoding)> {
Expand Down Expand Up @@ -1198,6 +1229,47 @@ mod tests {
assert!(format!("{:?}", compressor).contains("RleMiniBlockEncoder"));
}

#[test]
#[cfg(feature = "bitpacking")]
fn test_low_cardinality_prefers_bitpacking_over_rle() {
let strategy = DefaultCompressionStrategy::new();
let field = create_test_field("int_score", DataType::Int64);

// Low cardinality values (3/4/5) but with moderate run count:
// RLE compresses vs raw, yet bitpacking should be smaller.
let mut values: Vec<u64> = Vec::with_capacity(256);
for run_idx in 0..64 {
let value = match run_idx % 3 {
0 => 3u64,
1 => 4u64,
_ => 5u64,
};
values.extend(std::iter::repeat_n(value, 4));
}

let mut block = FixedWidthDataBlock {
bits_per_value: 64,
data: LanceBuffer::reinterpret_vec(values),
num_values: 256,
block_info: BlockInfo::default(),
};

use crate::statistics::ComputeStat;
block.compute_stat();

let data = DataBlock::FixedWidth(block);
let compressor = strategy.create_miniblock_compressor(&field, &data).unwrap();
let debug_str = format!("{:?}", compressor);
assert!(
debug_str.contains("InlineBitpacking"),
"expected InlineBitpacking, got: {debug_str}"
);
assert!(
!debug_str.contains("RleMiniBlockEncoder"),
"expected RLE to be skipped when bitpacking is smaller, got: {debug_str}"
);
}

fn check_uncompressed_encoding(encoding: &CompressiveEncoding, variable: bool) {
let chain = extract_array_encoding_chain(encoding);
if variable {
Expand Down
32 changes: 28 additions & 4 deletions rust/lance-encoding/src/encodings/physical/rle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -960,19 +960,43 @@ mod tests {
);
metadata_explicit.insert("lance-encoding:bss".to_string(), "off".to_string());

let mut generator = RleDataGenerator::new(vec![1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]);
let mut generator = RleDataGenerator::new(vec![
i32::MIN,
i32::MIN,
i32::MIN,
i32::MIN,
i32::MIN + 1,
i32::MIN + 1,
i32::MIN + 1,
i32::MIN + 1,
i32::MIN + 2,
i32::MIN + 2,
i32::MIN + 2,
i32::MIN + 2,
]);
let data_explicit = generator.generate_default(RowCount::from(10000)).unwrap();
check_round_trip_encoding_of_data(vec![data_explicit], &test_cases, metadata_explicit)
.await;

// 2. Test automatic RLE selection based on data characteristics
// 80% repetition should trigger RLE (> default 50% threshold)
// 80% repetition should trigger RLE (> default 50% threshold).
//
// Use values with the high bit set so bitpacking can't shrink the values.
// Explicitly disable BSS to ensure RLE is tested
let mut metadata = HashMap::new();
metadata.insert("lance-encoding:bss".to_string(), "off".to_string());

let mut values = vec![42i32; 8000]; // 80% repetition
values.extend([1i32, 2i32, 3i32, 4i32, 5i32].repeat(400)); // 20% variety
let mut values = vec![i32::MIN; 8000]; // 80% repetition
values.extend(
[
i32::MIN + 1,
i32::MIN + 2,
i32::MIN + 3,
i32::MIN + 4,
i32::MIN + 5,
]
.repeat(400),
); // 20% variety
let arr = Arc::new(Int32Array::from(values)) as Arc<dyn Array>;
check_round_trip_encoding_of_data(vec![arr], &test_cases, metadata).await;
}
Expand Down