From cef9462db9dd6b2b6513184d388894cf06bf4dae Mon Sep 17 00:00:00 2001 From: Alexander Alexandrov Date: Tue, 17 Mar 2026 14:41:24 +0200 Subject: [PATCH] fix(sql): fix a bug when planning semi- or antijoins Currently, the `exclude_using_columns` called from `expand_wildcard` doesn't consider the filtering semantics of semi- and antijoins when expanding wildcards on top of joins defined via `USING()` syntax. From each set of columns equated by a `USING()` expression, the code currently (1) sorts the set entries, and (2) retains only the first entry from each set. Because of that, the columns surviving the `exclude_using_columns` call might be wrongly chosen from the filtering side if the table qualifier from that side is lexicographically before the filtered side qualifier. For example, given this schema of two identical tables: ```sql create table s(x1 int, x2 int, x3 int); create table t(x1 int, x2 int, x3 int); ``` One would expect that the schema of queries where the `s` and `t` names are swapped will be identical. However, currently this is not the case: ```sql -- Q1 schema: x1 int, x2 int, x3 int (because s < t) select * from s left semi join t using (x1); -- Q2 schema: x2 int, x3 int (because t < s) select * from t left semi join s using (x1); ``` This commit fixes the issue and adds some regression tests. --- datafusion/expr/src/utils.rs | 7 ++++- datafusion/sql/tests/sql_integration.rs | 34 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 81a6fd393a989..94807d7c8bdaa 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -386,12 +386,17 @@ fn get_exprs_except_skipped( /// (once for each join side), but an unqualified wildcard should include it only once. /// This function returns the columns that should be excluded. fn exclude_using_columns(plan: &LogicalPlan) -> Result> { + let output_columns: HashSet<_> = plan.schema().columns().iter().cloned().collect(); let using_columns = plan.using_columns()?; let excluded = using_columns .into_iter() // For each USING JOIN condition, only expand to one of each join column in projection .flat_map(|cols| { - let mut cols = cols.into_iter().collect::>(); + // Only consider columns that survive in the output schema. + let mut cols = cols + .into_iter() + .filter(|c| output_columns.contains(c)) + .collect::>(); // sort join columns to make sure we consistently keep the same // qualified column cols.sort(); diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 29c17be69ce5f..731a06b48e125 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -4847,6 +4847,40 @@ fn test_using_join_wildcard_schema() { ); } +#[test] +fn test_using_join_wildcard_schema_semi_anti() { + let left_expected = vec!["s.x1".to_string(), "s.x2".to_string(), "s.x3".to_string()]; + let right_expected = vec!["t.x1".to_string(), "t.x2".to_string(), "t.x3".to_string()]; + + let sql = "WITH + s AS (SELECT 1 AS x1, 2 AS x2, 3 AS x3), + t AS (SELECT 1 AS x1, 4 AS x2, 5 AS x3) + SELECT * FROM s LEFT SEMI JOIN t USING (x1)"; + let plan = logical_plan(sql).unwrap(); + assert_eq!(plan.schema().field_names(), left_expected); + + let sql = "WITH + s AS (SELECT 1 AS x1, 2 AS x2, 3 AS x3), + t AS (SELECT 1 AS x1, 4 AS x2, 5 AS x3) + SELECT * FROM s RIGHT SEMI JOIN t USING (x1)"; + let plan = logical_plan(sql).unwrap(); + assert_eq!(plan.schema().field_names(), right_expected); + + let sql = "WITH + s AS (SELECT 1 AS x1, 2 AS x2, 3 AS x3), + t AS (SELECT 1 AS x1, 4 AS x2, 5 AS x3) + SELECT * FROM s LEFT ANTI JOIN t USING (x1)"; + let plan = logical_plan(sql).unwrap(); + assert_eq!(plan.schema().field_names(), left_expected); + + let sql = "WITH + s AS (SELECT 1 AS x1, 2 AS x2, 3 AS x3), + t AS (SELECT 1 AS x1, 4 AS x2, 5 AS x3) + SELECT * FROM s RIGHT ANTI JOIN t USING (x1)"; + let plan = logical_plan(sql).unwrap(); + assert_eq!(plan.schema().field_names(), right_expected); +} + #[test] fn test_2_nested_lateral_join_with_the_deepest_join_referencing_the_outer_most_relation() {