diff --git a/rust/lance-encoding/src/compression.rs b/rust/lance-encoding/src/compression.rs index 8e2edbaaec4..0e651dc27ed 100644 --- a/rust/lance-encoding/src/compression.rs +++ b/rust/lance-encoding/src/compression.rs @@ -200,6 +200,14 @@ fn try_rle_for_mini_block( let rle_bytes = (estimated_pairs as u128) * ((type_size + 1) as u128); if rle_bytes < raw_bytes { + #[cfg(feature = "bitpacking")] + { + if let Some(bitpack_bytes) = estimate_inline_bitpacking_bytes(data) { + if (bitpack_bytes as u128) < rle_bytes { + return None; + } + } + } return Some(Box::new(RleMiniBlockEncoder::new())); } None @@ -208,19 +216,8 @@ fn try_rle_for_mini_block( fn try_bitpack_for_mini_block(_data: &FixedWidthDataBlock) -> Option> { #[cfg(feature = "bitpacking")] { - use arrow_array::cast::AsArray; - let bits = _data.bits_per_value; - if !matches!(bits, 8 | 16 | 32 | 64) { - return None; - } - - let bit_widths = _data.expect_stat(Stat::BitWidth); - let widths = bit_widths.as_primitive::(); - let too_small = widths.len() == 1 - && InlineBitpacking::min_size_bytes(widths.value(0)) >= _data.data_size(); - - if !too_small { + if estimate_inline_bitpacking_bytes(_data).is_some() { return Some(Box::new(InlineBitpacking::new(bits))); } None @@ -231,6 +228,40 @@ fn try_bitpack_for_mini_block(_data: &FixedWidthDataBlock) -> Option Option { + use arrow_array::cast::AsArray; + + let bits = data.bits_per_value; + if !matches!(bits, 8 | 16 | 32 | 64) { + return None; + } + if data.num_values == 0 { + return None; + } + + let bit_widths = data.expect_stat(Stat::BitWidth); + let widths = bit_widths.as_primitive::(); + + let words_per_chunk: u128 = 1; + let word_bytes: u128 = (bits / 8) as u128; + let mut total_words: u128 = 0; + for i in 0..widths.len() { + let bit_width = widths.value(i) as u128; + let packed_words = (1024u128 * bit_width) / (bits as u128); + total_words = total_words.saturating_add(words_per_chunk.saturating_add(packed_words)); + } + + let estimated_bytes = total_words.saturating_mul(word_bytes); + let raw_bytes = data.data_size() as u128; + + if estimated_bytes >= raw_bytes { + return None; + } + + u64::try_from(estimated_bytes).ok() +} + fn try_bitpack_for_block( data: &FixedWidthDataBlock, ) -> Option<(Box, CompressiveEncoding)> { @@ -1198,6 +1229,47 @@ mod tests { assert!(format!("{:?}", compressor).contains("RleMiniBlockEncoder")); } + #[test] + #[cfg(feature = "bitpacking")] + fn test_low_cardinality_prefers_bitpacking_over_rle() { + let strategy = DefaultCompressionStrategy::new(); + let field = create_test_field("int_score", DataType::Int64); + + // Low cardinality values (3/4/5) but with moderate run count: + // RLE compresses vs raw, yet bitpacking should be smaller. + let mut values: Vec = Vec::with_capacity(256); + for run_idx in 0..64 { + let value = match run_idx % 3 { + 0 => 3u64, + 1 => 4u64, + _ => 5u64, + }; + values.extend(std::iter::repeat_n(value, 4)); + } + + let mut block = FixedWidthDataBlock { + bits_per_value: 64, + data: LanceBuffer::reinterpret_vec(values), + num_values: 256, + block_info: BlockInfo::default(), + }; + + use crate::statistics::ComputeStat; + block.compute_stat(); + + let data = DataBlock::FixedWidth(block); + let compressor = strategy.create_miniblock_compressor(&field, &data).unwrap(); + let debug_str = format!("{:?}", compressor); + assert!( + debug_str.contains("InlineBitpacking"), + "expected InlineBitpacking, got: {debug_str}" + ); + assert!( + !debug_str.contains("RleMiniBlockEncoder"), + "expected RLE to be skipped when bitpacking is smaller, got: {debug_str}" + ); + } + fn check_uncompressed_encoding(encoding: &CompressiveEncoding, variable: bool) { let chain = extract_array_encoding_chain(encoding); if variable { diff --git a/rust/lance-encoding/src/encodings/physical/rle.rs b/rust/lance-encoding/src/encodings/physical/rle.rs index 9ce9d70cbdf..f1cbb455c48 100644 --- a/rust/lance-encoding/src/encodings/physical/rle.rs +++ b/rust/lance-encoding/src/encodings/physical/rle.rs @@ -960,19 +960,43 @@ mod tests { ); metadata_explicit.insert("lance-encoding:bss".to_string(), "off".to_string()); - let mut generator = RleDataGenerator::new(vec![1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]); + let mut generator = RleDataGenerator::new(vec![ + i32::MIN, + i32::MIN, + i32::MIN, + i32::MIN, + i32::MIN + 1, + i32::MIN + 1, + i32::MIN + 1, + i32::MIN + 1, + i32::MIN + 2, + i32::MIN + 2, + i32::MIN + 2, + i32::MIN + 2, + ]); let data_explicit = generator.generate_default(RowCount::from(10000)).unwrap(); check_round_trip_encoding_of_data(vec![data_explicit], &test_cases, metadata_explicit) .await; // 2. Test automatic RLE selection based on data characteristics - // 80% repetition should trigger RLE (> default 50% threshold) + // 80% repetition should trigger RLE (> default 50% threshold). + // + // Use values with the high bit set so bitpacking can't shrink the values. // Explicitly disable BSS to ensure RLE is tested let mut metadata = HashMap::new(); metadata.insert("lance-encoding:bss".to_string(), "off".to_string()); - let mut values = vec![42i32; 8000]; // 80% repetition - values.extend([1i32, 2i32, 3i32, 4i32, 5i32].repeat(400)); // 20% variety + let mut values = vec![i32::MIN; 8000]; // 80% repetition + values.extend( + [ + i32::MIN + 1, + i32::MIN + 2, + i32::MIN + 3, + i32::MIN + 4, + i32::MIN + 5, + ] + .repeat(400), + ); // 20% variety let arr = Arc::new(Int32Array::from(values)) as Arc; check_round_trip_encoding_of_data(vec![arr], &test_cases, metadata).await; }