diff --git a/native/Cargo.lock b/native/Cargo.lock index 78fa3fa124..3cf8ac19ed 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -2428,8 +2428,7 @@ dependencies = [ [[package]] name = "datafusion-spark" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15d28510abfc85709578fcf9065325d43ee3303012c0ccec2dce351bdc577d00" +source = "git+https://github.com/unknowntpo/datafusion?branch=feat-str-to-map-52#c48fe102b17a83f675f3a77e7652507fe5bd6814" dependencies = [ "arrow", "bigdecimal", @@ -2643,7 +2642,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -3495,7 +3494,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -3543,7 +3542,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4585,7 +4584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.14.0", + "itertools 0.13.0", "log", "multimap", "petgraph", @@ -4604,7 +4603,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.117", @@ -4706,7 +4705,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.59.0", ] [[package]] @@ -5049,7 +5048,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.12.1", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -5584,7 +5583,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix 1.1.4", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -6272,7 +6271,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] diff --git a/native/Cargo.toml b/native/Cargo.toml index d5a6aeabc9..9d03d7a051 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -71,3 +71,7 @@ inherits = "release" lto = false # Skip LTO for faster linking codegen-units = 16 # Parallel codegen (faster compile, slightly larger binary) # overflow-checks inherited as false from release + +[patch.crates-io] +datafusion-spark = { git = "https://github.com/unknowntpo/datafusion", branch = "feat-str-to-map-52" } + diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 1030e30aaf..83c0a9059e 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -51,6 +51,7 @@ use datafusion_spark::function::hash::crc32::SparkCrc32; use datafusion_spark::function::hash::sha1::SparkSha1; use datafusion_spark::function::hash::sha2::SparkSha2; use datafusion_spark::function::map::map_from_entries::MapFromEntries; +use datafusion_spark::function::map::str_to_map::SparkStrToMap; use datafusion_spark::function::math::expm1::SparkExpm1; use datafusion_spark::function::math::hex::SparkHex; use datafusion_spark::function::math::width_bucket::SparkWidthBucket; @@ -404,6 +405,7 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkCrc32::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSpace::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitCount::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkStrToMap::default())); } /// Prepares arrow arrays for output. diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index c5880e00ed..a4a2e2b5b7 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -128,7 +128,8 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[MapValues] -> CometMapValues, classOf[MapFromArrays] -> CometMapFromArrays, classOf[MapContainsKey] -> CometMapContainsKey, - classOf[MapFromEntries] -> CometMapFromEntries) + classOf[MapFromEntries] -> CometMapFromEntries, + classOf[StringToMap] -> CometStrToMap) private val structExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[CreateNamedStruct] -> CometCreateNamedStruct, diff --git a/spark/src/main/scala/org/apache/comet/serde/maps.scala b/spark/src/main/scala/org/apache/comet/serde/maps.scala index 34e76215f3..52b89de99b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/maps.scala +++ b/spark/src/main/scala/org/apache/comet/serde/maps.scala @@ -130,3 +130,10 @@ object CometMapFromEntries extends CometScalarFunction[MapFromEntries]("map_from Compatible(None) } } + +object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") { + + override def getSupportLevel(expr: StringToMap): SupportLevel = { + Compatible(None) + } +} diff --git a/spark/src/test/resources/sql-tests/expressions/map/str_to_map.sql b/spark/src/test/resources/sql-tests/expressions/map/str_to_map.sql new file mode 100644 index 0000000000..8780c65ea2 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/map/str_to_map.sql @@ -0,0 +1,81 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Tests for Spark-compatible str_to_map function +-- https://spark.apache.org/docs/latest/api/sql/index.html#str_to_map +-- +-- Test cases derived from Spark test("StringToMap"): +-- https://github.com/apache/spark/blob/v4.0.0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala#L525-L618 + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +-- s0: Basic test with default delimiters +query spark_answer_only +SELECT str_to_map('a:1,b:2,c:3') + +-- s1: Preserve spaces in values +query spark_answer_only +SELECT str_to_map('a: ,b:2') + +-- s2: Custom key-value delimiter '=' +query spark_answer_only +SELECT str_to_map('a=1,b=2,c=3', ',', '=') + +-- s3: Empty string returns map with empty key and NULL value +query spark_answer_only +SELECT str_to_map('', ',', '=') + +-- s4: Custom pair delimiter '_' +query spark_answer_only +SELECT str_to_map('a:1_b:2_c:3', '_', ':') + +-- s5: Single key without value returns NULL value +query spark_answer_only +SELECT str_to_map('a') + +-- s6: Custom delimiters '&' and '=' +query spark_answer_only +SELECT str_to_map('a=1&b=2&c=3', '&', '=') + +-- Duplicate keys: EXCEPTION policy (Spark 3.0+ default) +-- TODO: Add LAST_WIN policy tests when spark.sql.mapKeyDedupPolicy config is supported +-- query spark_answer_only +-- SELECT str_to_map('a:1,b:2,a:3') + +-- NULL input returns NULL +query spark_answer_only +SELECT str_to_map(NULL, ',', ':') + +-- Explicit 3-arg form +query spark_answer_only +SELECT str_to_map('a:1,b:2,c:3', ',', ':') + +-- Missing key-value delimiter results in NULL value +query spark_answer_only +SELECT str_to_map('a,b:2', ',', ':') + +-- Multi-row test +query spark_answer_only +SELECT str_to_map(col) FROM (VALUES ('a:1,b:2'), ('x:9'), (NULL)) AS t(col) + +-- Multi-row with custom delimiter +query spark_answer_only +SELECT str_to_map(col, ',', '=') FROM (VALUES ('a=1,b=2'), ('x=9'), (NULL)) AS t(col) + +-- Per-row delimiters: each row can have different delimiters +query spark_answer_only +SELECT str_to_map(col1, col2, col3) FROM (VALUES ('a=1,b=2', ',', '='), ('x#9', ',', '#'), (NULL, ',', '=')) AS t(col1, col2, col3)