From 77ca076865041cccb05750120275f6756b053251 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Sun, 7 Sep 2025 00:11:18 -0700 Subject: [PATCH 1/3] Fix the DateTimeParseException --- .../transform/DataTypesTransformations.scala | 47 +++++++++++++++- .../geopackage/test_datetime_issue.gpkg | Bin 0 -> 53248 bytes .../sedona/sql/GeoPackageReaderTest.scala | 52 ++++++++++++++++-- .../sedona/sql/GeoPackageReaderTest.scala | 44 +++++++++++++++ .../sedona/sql/GeoPackageReaderTest.scala | 44 +++++++++++++++ 5 files changed, 181 insertions(+), 6 deletions(-) create mode 100644 spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala index 9a23f0a088a..2207194157a 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala @@ -18,8 +18,9 @@ */ package org.apache.sedona.sql.datasources.geopackage.transform -import java.time.{Instant, LocalDate} +import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset} import java.time.format.DateTimeFormatter +import java.time.format.DateTimeParseException import java.time.temporal.ChronoUnit object DataTypesTransformations { @@ -34,6 +35,48 @@ object DataTypesTransformations { } def epoch(timestampStr: String): Long = { - Instant.parse(timestampStr).toEpochMilli + try { + // Try parsing as-is first (works for timestamps with timezone info) + Instant.parse(timestampStr).toEpochMilli + } catch { + case _: DateTimeParseException => + // If parsing fails, try treating it as UTC (common case for GeoPackage) + try { + // Handle various datetime formats without timezone info + // Try different patterns to handle various millisecond formats + val patterns = Array( + "yyyy-MM-dd'T'HH:mm:ss.SSS", // 3 digits + "yyyy-MM-dd'T'HH:mm:ss.SS", // 2 digits + "yyyy-MM-dd'T'HH:mm:ss.S", // 1 digit + "yyyy-MM-dd'T'HH:mm:ss" // no milliseconds + ) + + var localDateTime: LocalDateTime = null + var lastException: DateTimeParseException = null + + for (pattern <- patterns) { + try { + val formatter = DateTimeFormatter.ofPattern(pattern) + localDateTime = LocalDateTime.parse(timestampStr, formatter) + lastException = null + } catch { + case e: DateTimeParseException => + lastException = e + } + } + + if (localDateTime != null) { + localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli + } else { + throw lastException + } + } catch { + case e: DateTimeParseException => + throw new IllegalArgumentException( + s"Unable to parse datetime: $timestampStr. " + + s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.S]' or 'yyyy-MM-ddTHH:mm:ss[.S]Z'", + e) + } + } } } diff --git a/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg b/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg new file mode 100644 index 0000000000000000000000000000000000000000..f53a11fe087481796c82c0f12898b5bcbe8302a9 GIT binary patch literal 53248 zcmeI)%WvXF90zcd2W0bDa;ea+RCPofk!T?@213ZD+NO!aqD@EwMr|5J76n7Bn5_B*HCoE*JM1Asoj|)4wJ9*O@%@$xLU4{&#U);CFLAdHgDAE4M$L=GNt>OWwb^ z#i?U%=_l{ci(3n|x!)E(cHhkXIQ@rvbLx2dt$hJJf7k+d=>?DP*)vx&YvjMKDCNfM zvaDPiN=;YmHQh10xSJ7oB#}rvpC5?CQAt+kiEXl0C)e^-g-GI;Qjn=E)GL?O+MwF| z_PYTjeI$`|?%;r3Pf;nUHPxV-YBe$-N-xwKt*sakuzQrwN|~KhS|VjdHVO*KrBf$4 zk*s$2B1mi-9VF~FuHEn5+NXxyB^SzB7Zn0NcKp`mGMUr#G9Z6yoM#~zO_7IS_7qHkiFEjyFbESnpepk~CFNowb1 z6?ILQ)#8Y(s;V{ldU)TPAvYsJ^4BBRLPJKZh3)pTQPtn8-wX8`9j6W5-r=lFpN1$F zn^`Tr-mKCac|*-tWKAi_`i;KYj_VF|$vL<0EH-vV(VlLPcDAfC`~Ok%aMt6C$6ap{ z-OMU0^{QfMH*#yxZJmgmh3?+ma)ykVTr8VypdWTxxo^w3X(%I4Rbyr8Es;{`Q}aC3 ze|V~nsJC@E9hA%sGy!${vBM5cwp|@MHkAvqQDYOQJ5=>1&%aL1g8bhlX3FjMoyUhJ z=6%<-)2p4C&SKAZsTq$i5OCd{SbJ+Ix*?a8ym6_~XZD#{CKZ_~M+(|TD%%NzO~64{ zO&V=lDa!Q1e~(^lrP*XXmuc-N6dMK5S-<9-{uw#`pm zR-U}^bZtAwfJN4snd;T-R5=*ChZj2r4K|{EpO?+Ch+Tk%REzYKp_Wudv!d8a=MJ*9 zl|0Sf;&0svkU%X9k+Ph4jVuAn!AOHafKmY;|fB*y_009U*0D!$QB6nZ?^KX6bOhrRhig2NYXB@VEQ`x)qxU0SG_<0uX=z1Rwwb z2tWV=5P-mY3z*;kr$73S2?7v+00bZa0SG_<0uX=z1Rwx`2^PTp{|W9~>>30h009U< z00Izz00bZa0SG{#Er9#~NCgN$00Izz00bZa0SG_<0uX?}mI00Izz00bZa0SG_<0uX?} + val createdTimestamp = row.getAs[Timestamp]("created_at") + val updatedTimestamp = row.getAs[Timestamp]("updated_at") + createdTimestamp should not be null + updatedTimestamp should not be null + createdTimestamp.getTime should be > 0L + updatedTimestamp.getTime should be > 0L + } + + // Test showMetadata option with the same file + noException should be thrownBy { + val metadataDf = sparkSession.read + .format("geopackage") + .option("showMetadata", "true") + .load(testFilePath) + metadataDf.select("last_change").collect() + } + } } describe("GeoPackage Raster Data Test") { @@ -257,7 +301,7 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers { .load(inputPath) .count shouldEqual 34 - val df = sparkSessionMinio.read + val df = sparkSession.read .format("geopackage") .option("tableName", "point1") .load(inputPath) diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala index 9de19c3c483..6d9f41bf4e3 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala @@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers { df.count() shouldEqual expectedCount } } + + it("should handle datetime fields without timezone information") { + // This test verifies the fix for DateTimeParseException when reading + // GeoPackage files with datetime fields that don't include timezone info + val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg" + + // Test reading the test_features table with problematic datetime formats + val df = sparkSession.read + .format("geopackage") + .option("tableName", "test_features") + .load(testFilePath) + + // The test should not throw DateTimeParseException when reading datetime fields + noException should be thrownBy { + df.select("created_at", "updated_at").collect() + } + + // Verify that datetime fields are properly parsed as TimestampType + df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType + df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType + + // Verify that we can read the datetime values + val datetimeValues = df.select("created_at", "updated_at").collect() + datetimeValues should not be empty + + // Verify that datetime values are valid timestamps + datetimeValues.foreach { row => + val createdTimestamp = row.getAs[Timestamp]("created_at") + val updatedTimestamp = row.getAs[Timestamp]("updated_at") + createdTimestamp should not be null + updatedTimestamp should not be null + createdTimestamp.getTime should be > 0L + updatedTimestamp.getTime should be > 0L + } + + // Test showMetadata option with the same file + noException should be thrownBy { + val metadataDf = sparkSession.read + .format("geopackage") + .option("showMetadata", "true") + .load(testFilePath) + metadataDf.select("last_change").collect() + } + } } describe("GeoPackage Raster Data Test") { diff --git a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala index 9de19c3c483..6d9f41bf4e3 100644 --- a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala +++ b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala @@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers { df.count() shouldEqual expectedCount } } + + it("should handle datetime fields without timezone information") { + // This test verifies the fix for DateTimeParseException when reading + // GeoPackage files with datetime fields that don't include timezone info + val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg" + + // Test reading the test_features table with problematic datetime formats + val df = sparkSession.read + .format("geopackage") + .option("tableName", "test_features") + .load(testFilePath) + + // The test should not throw DateTimeParseException when reading datetime fields + noException should be thrownBy { + df.select("created_at", "updated_at").collect() + } + + // Verify that datetime fields are properly parsed as TimestampType + df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType + df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType + + // Verify that we can read the datetime values + val datetimeValues = df.select("created_at", "updated_at").collect() + datetimeValues should not be empty + + // Verify that datetime values are valid timestamps + datetimeValues.foreach { row => + val createdTimestamp = row.getAs[Timestamp]("created_at") + val updatedTimestamp = row.getAs[Timestamp]("updated_at") + createdTimestamp should not be null + updatedTimestamp should not be null + createdTimestamp.getTime should be > 0L + updatedTimestamp.getTime should be > 0L + } + + // Test showMetadata option with the same file + noException should be thrownBy { + val metadataDf = sparkSession.read + .format("geopackage") + .option("showMetadata", "true") + .load(testFilePath) + metadataDf.select("last_change").collect() + } + } } describe("GeoPackage Raster Data Test") { From 488d449f606538c36c1f24df39a86a99871f3e11 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Sun, 7 Sep 2025 00:20:23 -0700 Subject: [PATCH 2/3] Improve according to the feedback --- .../transform/DataTypesTransformations.scala | 63 +++++++------------ 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala index 2207194157a..03d0360baf8 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala @@ -24,13 +24,18 @@ import java.time.format.DateTimeParseException import java.time.temporal.ChronoUnit object DataTypesTransformations { - def getDays(dateString: String): Int = { - val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") - - val date = LocalDate.parse(dateString, formatter) + // Pre-created formatters to avoid repeated object creation + private val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") + private val datetimeFormatters = Array( + DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS"), // 3 digits + DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SS"), // 2 digits + DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.S"), // 1 digit + DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss") // no milliseconds + ) + def getDays(dateString: String): Int = { + val date = LocalDate.parse(dateString, dateFormatter) val epochDate = LocalDate.of(1970, 1, 1) - ChronoUnit.DAYS.between(epochDate, date).toInt } @@ -41,42 +46,22 @@ object DataTypesTransformations { } catch { case _: DateTimeParseException => // If parsing fails, try treating it as UTC (common case for GeoPackage) - try { - // Handle various datetime formats without timezone info - // Try different patterns to handle various millisecond formats - val patterns = Array( - "yyyy-MM-dd'T'HH:mm:ss.SSS", // 3 digits - "yyyy-MM-dd'T'HH:mm:ss.SS", // 2 digits - "yyyy-MM-dd'T'HH:mm:ss.S", // 1 digit - "yyyy-MM-dd'T'HH:mm:ss" // no milliseconds - ) - - var localDateTime: LocalDateTime = null - var lastException: DateTimeParseException = null - - for (pattern <- patterns) { - try { - val formatter = DateTimeFormatter.ofPattern(pattern) - localDateTime = LocalDateTime.parse(timestampStr, formatter) - lastException = null - } catch { - case e: DateTimeParseException => - lastException = e - } - } - - if (localDateTime != null) { - localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli - } else { - throw lastException + // Handle various datetime formats without timezone info + // Try different patterns to handle various millisecond formats + for (formatter <- datetimeFormatters) { + try { + val localDateTime = LocalDateTime.parse(timestampStr, formatter) + return localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli + } catch { + case _: DateTimeParseException => + // Continue to next formatter } - } catch { - case e: DateTimeParseException => - throw new IllegalArgumentException( - s"Unable to parse datetime: $timestampStr. " + - s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.S]' or 'yyyy-MM-ddTHH:mm:ss[.S]Z'", - e) } + + // If all formatters failed, throw a descriptive exception + throw new IllegalArgumentException( + s"Unable to parse datetime: $timestampStr. " + + s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.S]' or 'yyyy-MM-ddTHH:mm:ss[.S]Z'") } } } From 63a167e68cca29c30a5abd019f40ea3c1c47810f Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Sun, 7 Sep 2025 00:26:28 -0700 Subject: [PATCH 3/3] Improve according to the feedback --- .../geopackage/transform/DataTypesTransformations.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala index 03d0360baf8..c0e532b08af 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala @@ -59,9 +59,8 @@ object DataTypesTransformations { } // If all formatters failed, throw a descriptive exception - throw new IllegalArgumentException( - s"Unable to parse datetime: $timestampStr. " + - s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.S]' or 'yyyy-MM-ddTHH:mm:ss[.S]Z'") + throw new IllegalArgumentException(s"Unable to parse datetime: $timestampStr. " + + s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.SSS|.SS|.S]' or 'yyyy-MM-ddTHH:mm:ss[.SSS|.SS|.S]Z'") } } }