From bacf7f87c5cc76f56de0d925cde2bbbcde800818 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 15 Apr 2025 14:13:35 -0400 Subject: [PATCH 1/2] Add slt tests for datafusion.execution.parquet.coerce_int96 setting --- .../sqllogictest/test_files/parquet.slt | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 2970b2effb3e..5ff340bb8260 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -629,3 +629,79 @@ physical_plan statement ok drop table foo + + +# Tests for int96 timestamps written by spark +# See https://github.com/apache/datafusion/issues/9981 + +statement ok +CREATE EXTERNAL TABLE int96_from_spark +STORED AS PARQUET +LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; + +# by default the value is read as nanosecond precision +query TTT +describe int96_from_spark +---- +a Timestamp(Nanosecond, None) YES + +# Note that the values are read as nanosecond precision +query P +select * from int96_from_spark +---- +2024-01-01T20:34:56.123456 +2024-01-01T01:00:00 +1816-03-29T08:56:08.066277376 +2024-12-30T23:00:00 +NULL +1815-11-08T16:01:01.191053312 + +statement ok +drop table int96_from_spark; + +# Enable coercion of int96 to milliseconds +statement ok +set datafusion.execution.parquet.coerce_int96 = ms + +statement ok +CREATE EXTERNAL TABLE int96_from_spark +STORED AS PARQUET +LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; + +# The value should be read as MICROSECOND precision +# see https://github.com/apache/datafusion/issues/15721 +query TTT +describe int96_from_spark +---- +a Timestamp(Nanosecond, None) YES + +# Per https://github.com/apache/parquet-testing/blob/6e851ddd768d6af741c7b15dc594874399fc3cff/data/int96_from_spark.md?plain=1#L37 +# these values should be +# +# Some("2024-01-01T12:34:56.123456"), +# Some("2024-01-01T01:00:00Z"), +# 
Some("9999-12-31T01:00:00-02:00"), +# Some("2024-12-31T01:00:00+02:00"), +# None, +# Some("290000-12-31T01:00:00+02:00")) +# +# However, printing the large dates (9999-12-31 and 290000-12-31) is not supported by +# arrow yet +# +# See https://github.com/apache/arrow-rs/issues/7287 +query P +select * from int96_from_spark +---- +2024-01-01T20:34:56.123 +2024-01-01T01:00:00 +NULL +2024-12-30T23:00:00 +NULL +NULL + +# Cleanup / reset default setting +statement ok +drop table int96_from_spark; + +statement ok +set datafusion.execution.parquet.coerce_int96 = ns From 6508ec350dc425ea22394fe002fe70a0ec40dd4c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 28 Apr 2025 11:51:36 -0400 Subject: [PATCH 2/2] tweak --- datafusion/sqllogictest/test_files/parquet.slt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 42a7382935e3..abc6fdab3c8a 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -668,10 +668,9 @@ CREATE EXTERNAL TABLE int96_from_spark STORED AS PARQUET LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; -# The value should be read as MICROSECOND precision -# see https://github.com/apache/datafusion/issues/15721 +# Print schema query TTT -describe int96_from_spark +describe int96_from_spark; ---- a Timestamp(Millisecond, None) YES @@ -704,4 +703,4 @@ statement ok drop table int96_from_spark; statement ok -set datafusion.execution.parquet.coerce_int96 = ns +set datafusion.execution.parquet.coerce_int96 = ns;