Skip to content

Commit 0e5d8bc

Browse files
authored
Re-enable file skipping on timestamp columns based on maxValues stat
2 parents 04f412b + 7e289bf commit 0e5d8bc

File tree

3 files changed

+36
-43
lines changed

3 files changed

+36
-43
lines changed

kernel/src/kernel_predicates/mod.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,26 @@ pub trait DataSkippingPredicateEvaluator {
833833
inverted: bool,
834834
) -> Option<Self::Output> {
835835
let max = self.get_max_stat(col, &val.data_type())?;
836+
837+
// Delta Lake min/max stats are stored with millisecond precision (truncated, not rounded up) so we can't use direct comparison.
838+
// For Ordering::Greater comparison we adjust timestamp values by subtracting 999 microseconds from the value to ensure
839+
// that comparisons against max stats are correct. Any rows that pass this filter will be re-evaluated later for exact matches.
840+
// See:
841+
// - https://github.com/delta-io/delta-kernel-rs/issues/1002
842+
// - https://github.com/delta-io/delta-kernel-rs/pull/1003
843+
if matches!(val, Scalar::Timestamp(_) | Scalar::TimestampNtz(_)) {
844+
if !inverted && ord == Ordering::Greater {
845+
let max_ts_adjusted = timestamp_subtract(val, 999);
846+
tracing::debug!(
847+
"Adjusted timestamp value for col {col} for max stat comparison from {val:?} to {max_ts_adjusted:?}"
848+
);
849+
return self.eval_partial_cmp(ord, max, &max_ts_adjusted, inverted);
850+
}
851+
852+
// Disabled by default: https://github.com/delta-io/delta-kernel-rs/issues/1002
853+
return None;
854+
}
855+
836856
self.eval_partial_cmp(ord, max, val, inverted)
837857
}
838858

@@ -977,3 +997,12 @@ impl<T: DataSkippingPredicateEvaluator + ?Sized> KernelPredicateEvaluator for T
977997
self.finish_eval_pred_junction(op, preds, inverted)
978998
}
979999
}
1000+
1001+
/// Adjust timestamp value by subtracting the given interval in microseconds.
1002+
fn timestamp_subtract(val: &Scalar, interval_micros: i64) -> Scalar {
1003+
match val {
1004+
Scalar::Timestamp(ts) => Scalar::Timestamp(*ts - interval_micros),
1005+
Scalar::TimestampNtz(ts) => Scalar::TimestampNtz(*ts - interval_micros),
1006+
_ => val.clone(),
1007+
}
1008+
}

kernel/src/scan/data_skipping.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -201,14 +201,8 @@ impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator {
201201
Some(joined_column_expr!("minValues", col))
202202
}
203203

204-
/// Retrieves the maximum value of a column, if it exists and has the requested type.
205-
// TODO(#1002): we currently don't support file skipping on timestamp columns' max stat since
206-
// they are truncated to milliseconds in add.stats.
207-
fn get_max_stat(&self, col: &ColumnName, data_type: &DataType) -> Option<Expr> {
208-
match data_type {
209-
&DataType::TIMESTAMP | &DataType::TIMESTAMP_NTZ => None,
210-
_ => Some(joined_column_expr!("maxValues", col)),
211-
}
204+
fn get_max_stat(&self, col: &ColumnName, _data_type: &DataType) -> Option<Expr> {
205+
Some(joined_column_expr!("maxValues", col))
212206
}
213207

214208
/// Retrieves the null count of a column, if it exists.

kernel/src/scan/data_skipping/tests.rs

Lines changed: 5 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -340,39 +340,9 @@ fn test_sql_where() {
340340
do_test(ALL_NULL, pred, MISSING, None, None);
341341
}
342342

343-
// TODO(#1002): we currently don't support file skipping on timestamp columns' max stat since they
344-
// are truncated to milliseconds in add.stats.
345-
#[test]
346-
fn test_timestamp_skipping_disabled() {
347-
let creator = DataSkippingPredicateCreator;
348-
let col = &column_name!("timestamp_col");
349-
350-
assert!(
351-
creator.get_min_stat(col, &DataType::TIMESTAMP).is_some(),
352-
"get_min_stat should return Some: allow data skipping on timestamp minValues"
353-
);
354-
assert_eq!(
355-
creator.get_max_stat(col, &DataType::TIMESTAMP),
356-
None,
357-
"get_max_stat should return None: no data skipping on timestamp maxValues"
358-
);
359-
assert!(
360-
creator
361-
.get_min_stat(col, &DataType::TIMESTAMP_NTZ)
362-
.is_some(),
363-
"get_min_stat should return Some: allow data skipping on timestamp_ntz minValues"
364-
);
365-
assert_eq!(
366-
creator.get_max_stat(col, &DataType::TIMESTAMP_NTZ),
367-
None,
368-
"get_max_stat should return None: no data skipping on timestamp_ntz maxValues"
369-
);
370-
}
371343

372-
// TODO(#1002): we currently don't support file skipping on timestamp columns' max stat since they
373-
// are truncated to milliseconds in add.stats.
374344
#[test]
375-
fn test_timestamp_predicates_dont_data_skip() {
345+
fn test_timestamp_predicates_data_skip() {
376346
let col = &column_expr!("ts_col");
377347
for timestamp in [&Scalar::Timestamp(1000000), &Scalar::TimestampNtz(1000000)] {
378348
// LT will do minValues -> OK
@@ -383,12 +353,12 @@ fn test_timestamp_predicates_dont_data_skip() {
383353
"Column(minValues.ts_col) < 1000000"
384354
);
385355

386-
// GT will do maxValues -> BLOCKED
356+
// GT will adjust predicate value for maxValues comparison
387357
let pred = Pred::gt(col.clone(), timestamp.clone());
388358
let skipping_pred = as_data_skipping_predicate(&pred);
389-
assert!(
390-
skipping_pred.is_none(),
391-
"Expected no data skipping for timestamp predicate: {pred:#?}, got {skipping_pred:#?}"
359+
assert_eq!(
360+
skipping_pred.unwrap().to_string(),
361+
"Column(maxValues.ts_col) > 999001"
392362
);
393363

394364
let pred = Pred::eq(col.clone(), timestamp.clone());

0 commit comments

Comments
 (0)