Skip to content

Commit 81a8b7d

Browse files
authored
feat!: Add optional stats field to remove action (#1390)
An optional stats field is part of the remove schema. This PR adds it here. While utility of stats can be limited it is important to propagate for remove_actions in transactions to appropriately record number of records removed on a remove action. It is likely also important to not drop this field when doing things like log_compaction. BREAKING CHANGE: Adds a field in the middle of a the remove_file schema.
1 parent 2931ebb commit 81a8b7d

File tree

2 files changed

+92
-7
lines changed

2 files changed

+92
-7
lines changed

kernel/src/actions/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,12 @@ pub(crate) struct Remove {
878878
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
879879
pub(crate) size: Option<i64>,
880880

881+
/// Contains [statistics] (e.g., count, min/max values for columns) about the data in this logical file encoded as a JSON string.
882+
///
883+
/// [statistics]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Per-file-Statistics
884+
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
885+
pub stats: Option<String>,
886+
881887
/// Map containing metadata about this logical file.
882888
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
883889
pub(crate) tags: Option<HashMap<String, String>>,
@@ -1237,6 +1243,7 @@ mod tests {
12371243
StructField::nullable("extendedFileMetadata", DataType::BOOLEAN),
12381244
partition_values_field(),
12391245
StructField::nullable("size", DataType::LONG),
1246+
StructField::nullable("stats", DataType::STRING),
12401247
tags_field(),
12411248
deletion_vector_field(),
12421249
StructField::nullable("baseRowId", DataType::LONG),

kernel/src/actions/visitors.rs

Lines changed: 85 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ impl RemoveVisitor {
178178
getters: &[&'a dyn GetData<'a>],
179179
) -> DeltaResult<Remove> {
180180
require!(
181-
getters.len() == 14,
181+
getters.len() == 15,
182182
Error::InternalError(format!(
183183
"Wrong number of RemoveVisitor getters: {}",
184184
getters.len()
@@ -194,14 +194,14 @@ impl RemoveVisitor {
194194
getters[4].get_opt(row_index, "remove.partitionValues")?;
195195

196196
let size: Option<i64> = getters[5].get_opt(row_index, "remove.size")?;
197+
let stats: Option<String> = getters[6].get_opt(row_index, "remove.stats")?;
198+
// TODO(nick) tags are skipped in getters[7]
197199

198-
// TODO(nick) tags are skipped in getters[6]
199-
200-
let deletion_vector = visit_deletion_vector_at(row_index, &getters[7..])?;
200+
let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..])?;
201201

202-
let base_row_id: Option<i64> = getters[12].get_opt(row_index, "remove.baseRowId")?;
202+
let base_row_id: Option<i64> = getters[13].get_opt(row_index, "remove.baseRowId")?;
203203
let default_row_commit_version: Option<i64> =
204-
getters[13].get_opt(row_index, "remove.defaultRowCommitVersion")?;
204+
getters[14].get_opt(row_index, "remove.defaultRowCommitVersion")?;
205205

206206
Ok(Remove {
207207
path,
@@ -210,6 +210,7 @@ impl RemoveVisitor {
210210
extended_file_metadata,
211211
partition_values,
212212
size,
213+
stats,
213214
tags: None,
214215
deletion_vector,
215216
base_row_id,
@@ -834,7 +835,7 @@ mod tests {
834835
let json_strings: StringArray = vec![
835836
r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#,
836837
r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#,
837-
r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#,
838+
r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452,"stats":"{\"numRecords\":1}"}}"#,
838839
]
839840
.into();
840841
let batch = parse_json_batch(json_strings);
@@ -850,6 +851,7 @@ mod tests {
850851
("c2".to_string(), "c".to_string()),
851852
])),
852853
size: Some(452),
854+
stats: Some(r#"{"numRecords":1}"#.to_string()),
853855
..Default::default()
854856
};
855857
assert_eq!(
@@ -863,6 +865,82 @@ mod tests {
863865
);
864866
}
865867

868+
#[test]
869+
fn test_parse_remove_all_fields_unique() {
870+
// This test verifies that all fields in the Remove action are correctly parsed
871+
// and that each field gets a unique value, ensuring no index collisions
872+
let json_strings: StringArray = vec![
873+
r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#,
874+
r#"{"metaData":{"id":"test-id","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1670892997849}}"#,
875+
r#"{"remove":{"path":"test-path.parquet","deletionTimestamp":1234567890,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"part":"value"},"size":9999,"stats":"{\"numRecords\":42}","deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":3},"baseRowId":100,"defaultRowCommitVersion":5}}"#,
876+
]
877+
.into();
878+
let batch = parse_json_batch(json_strings);
879+
let mut remove_visitor = RemoveVisitor::default();
880+
remove_visitor.visit_rows_of(batch.as_ref()).unwrap();
881+
882+
assert_eq!(
883+
remove_visitor.removes.len(),
884+
1,
885+
"Expected exactly one remove action"
886+
);
887+
888+
let remove = &remove_visitor.removes[0];
889+
890+
// Verify each field has the expected unique value
891+
assert_eq!(remove.path, "test-path.parquet", "path mismatch");
892+
assert_eq!(
893+
remove.deletion_timestamp,
894+
Some(1234567890),
895+
"deletion_timestamp mismatch"
896+
);
897+
assert!(!remove.data_change, "data_change mismatch");
898+
assert_eq!(
899+
remove.extended_file_metadata,
900+
Some(true),
901+
"extended_file_metadata mismatch"
902+
);
903+
assert_eq!(
904+
remove.partition_values,
905+
Some(HashMap::from([("part".to_string(), "value".to_string())])),
906+
"partition_values mismatch"
907+
);
908+
assert_eq!(remove.size, Some(9999), "size mismatch");
909+
assert_eq!(
910+
remove.stats,
911+
Some(r#"{"numRecords":42}"#.to_string()),
912+
"stats mismatch"
913+
);
914+
915+
// Verify deletion vector fields
916+
let dv = remove
917+
.deletion_vector
918+
.as_ref()
919+
.expect("deletion_vector should be present");
920+
assert_eq!(
921+
dv.path_or_inline_dv, "vBn[lx{q8@P<9BNH/isA",
922+
"deletion_vector.path_or_inline_dv mismatch"
923+
);
924+
assert_eq!(dv.offset, Some(1), "deletion_vector.offset mismatch");
925+
assert_eq!(
926+
dv.size_in_bytes, 36,
927+
"deletion_vector.size_in_bytes mismatch"
928+
);
929+
assert_eq!(dv.cardinality, 3, "deletion_vector.cardinality mismatch");
930+
931+
// Verify row tracking fields (these would have been incorrect with the bug)
932+
assert_eq!(
933+
remove.base_row_id,
934+
Some(100),
935+
"base_row_id mismatch - check getter index"
936+
);
937+
assert_eq!(
938+
remove.default_row_commit_version,
939+
Some(5),
940+
"default_row_commit_version mismatch - check getter index"
941+
);
942+
}
943+
866944
#[test]
867945
fn test_parse_txn() {
868946
let json_strings: StringArray = vec![

0 commit comments

Comments
 (0)