diff --git a/Cargo.toml b/Cargo.toml index c17f4f7affaa..55855d09d50e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,21 +77,21 @@ version = "45.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "54.0.0", features = [ +arrow = { version = "54.1.0", features = [ "prettyprint", ] } -arrow-array = { version = "54.0.0", default-features = false, features = [ +arrow-array = { version = "54.1.0", default-features = false, features = [ "chrono-tz", ] } -arrow-buffer = { version = "54.0.0", default-features = false } -arrow-flight = { version = "54.0.0", features = [ +arrow-buffer = { version = "54.1.0", default-features = false } +arrow-flight = { version = "54.1.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "54.0.0", default-features = false, features = [ +arrow-ipc = { version = "54.1.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "54.0.0", default-features = false } -arrow-schema = { version = "54.0.0", default-features = false } +arrow-ord = { version = "54.1.0", default-features = false } +arrow-schema = { version = "54.1.0", default-features = false } async-trait = "0.1.73" bigdecimal = "0.4.7" bytes = "1.4" @@ -133,7 +133,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "54.0.0", default-features = false, features = [ +parquet = { version = "54.1.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4a4d0157c620..a5cf71426607 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -175,9 +175,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ccdcc8fb14508ca20aaec7076032e5c0b0751b906036d4496786e2f227a37a" +checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -196,9 +196,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1aad8e27f32e411a0fc0bf5a625a35f0bf9b9f871cf4542abe31f7cef4beea2" +checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" dependencies = [ "arrow-array", "arrow-buffer", @@ -210,9 +210,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6ed90c28c6f73a706c55799b8cc3a094e89257238e5b1d65ca7c70bd3ae23f" +checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" dependencies = [ "ahash", "arrow-buffer", @@ -227,9 +227,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4a40bdc1552ea10fbdeae4e5a945d8572c32f66bce457b96c13d9c46b80447" +checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" dependencies = [ "bytes", "half", @@ -238,9 +238,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "430c0a21aa7f81bcf0f97c57216d7127795ea755f494d27bae2bd233be43c2cc" +checksum = 
"626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" dependencies = [ "arrow-array", "arrow-buffer", @@ -259,9 +259,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4444c8f8c57ac00e6a679ede67d1ae8872c170797dc45b46f75702437a77888" +checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" dependencies = [ "arrow-array", "arrow-cast", @@ -275,9 +275,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09af476cfbe9879937e50b1334c73189de6039186e025b1b1ac84b283b87b20e" +checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" dependencies = [ "arrow-buffer", "arrow-schema", @@ -287,9 +287,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136296e8824333a8a4c4a6e508e4aa65d5678b801246d0408825ae7b2523c628" +checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" dependencies = [ "arrow-array", "arrow-buffer", @@ -301,9 +301,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e222ad0e419ab8276818c5605a5bb1e35ed86fa8c5e550726433cc63b09c3c78" +checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -321,9 +321,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddf14c5f03b679ec8ceac4dfac43f63cdc4ed54dab3cc120a4ef46af38481eb" +checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" dependencies = [ "arrow-array", "arrow-buffer", @@ -334,9 +334,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9acdc58da19f383f4ba381fa0e3583534ae2ceb31269aaf4a03f08ff13e8443" +checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" dependencies = [ "arrow-array", "arrow-buffer", @@ -347,15 +347,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1822a1a952955637e85e8f9d6b0e04dd75d65492b87ec548dd593d3a1f772b" +checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" [[package]] name = "arrow-select" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c4172e9a12dfe15303d3926269f9ead471ea93bdd067d113abc65cb6c48e246" +checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" dependencies = [ "ahash", "arrow-array", @@ -367,9 +367,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.0.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73683040445f4932342781926189901c9521bb1a787c35dbe628a3ce51372d3c" +checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" dependencies = [ "arrow-array", "arrow-buffer", @@ -2901,9 +2901,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.0.0" +version = "54.1.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3334c50239d9f4951653d84fa6f636da86f53742e5e5849a30fbe852f3ff4383" +checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" dependencies = [ "ahash", "arrow-array", @@ -2927,6 +2927,7 @@ dependencies = [ "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -3716,6 +3717,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.1" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 6b6b52fce743..7daa32562173 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -29,7 +29,7 @@ rust-version = "1.81.0" readme = "README.md" [dependencies] -arrow = { version = "54.0.0" } +arrow = { version = "54.1.0" } async-trait = "0.1.0" aws-config = "1.5.0" aws-credential-types = "1.2.0" @@ -57,7 +57,7 @@ home = "=0.5.11" mimalloc = { version = "0.1", default-features = false } object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } -parquet = { version = "54.0.0", default-features = false } +parquet = { version = "54.1.0", default-features = false } regex = "1.8" rustyline = "15.0" tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] } diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 54344d55bbd1..82735334c7f8 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -309,10 +309,8 @@ impl FileOpener for ArrowOpener { for (dict_block, dict_result) in footer.dictionaries().iter().flatten().zip(dict_results) { - decoder.read_dictionary( - dict_block, - &Buffer::from_bytes(dict_result.into()), - )?; + decoder + .read_dictionary(dict_block, &Buffer::from(dict_result))?; } // filter recordbatches according to range @@ -348,10 +346,7 @@ impl FileOpener for ArrowOpener { .zip(recordbatch_results) .filter_map(move |(block, data)| { decoder - .read_record_batch( - &block, - &Buffer::from_bytes(data.into()), - ) + .read_record_batch(&block, &Buffer::from(data)) .transpose() }), ) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 1ebbf92c736e..eed11f634c9d 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -30,8 +30,8 @@ use arrow::{ record_batch::RecordBatch, }; use arrow_array::{ - Array, BooleanArray, DictionaryArray, Float32Array, Float64Array, Int8Array, - UnionArray, + record_batch, Array, BooleanArray, DictionaryArray, Float32Array, Float64Array, + Int8Array, UnionArray, }; use arrow_buffer::ScalarBuffer; use arrow_schema::{ArrowError, SchemaRef, UnionFields, UnionMode}; @@ -1121,6 +1121,39 @@ async fn join() -> Result<()> { Ok(()) } +#[tokio::test] +async fn join_coercion_unnnamed() -> Result<()> { + let ctx = SessionContext::new(); + + // Test that join will coerce column types when necessary + // even when the relations don't have unique names + let left = ctx.read_batch(record_batch!( + ("id", Int32, [1, 2, 3]), + ("name", Utf8, ["a", "b", "c"]) + )?)?; + let right = ctx.read_batch(record_batch!( + ("id", Int32, [10, 3]), + ("name", Utf8View, ["d", "c"]) // 
Utf8View is a different type + )?)?; + let cols = vec!["name", "id"]; + + let filter = None; + let join = right.join(left, JoinType::LeftAnti, &cols, &cols, filter)?; + let results = join.collect().await?; + + assert_batches_sorted_eq!( + [ + "+----+------+", + "| id | name |", + "+----+------+", + "| 10 | d |", + "+----+------+", + ], + &results + ); + Ok(()) +} + #[tokio::test] async fn join_on() -> Result<()> { let left = test_table_with_name("a") diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 993051eaeee1..fc1955705ee0 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -26,6 +26,8 @@ use std::ops::{AddAssign, SubAssign}; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{ DataType, IntervalDayTime, IntervalMonthDayNano, IntervalUnit, TimeUnit, + MAX_DECIMAL128_FOR_EACH_PRECISION, MAX_DECIMAL256_FOR_EACH_PRECISION, + MIN_DECIMAL128_FOR_EACH_PRECISION, MIN_DECIMAL256_FOR_EACH_PRECISION, }; use datafusion_common::rounding::{alter_fp_rounding_mode, next_down, next_up}; use datafusion_common::{internal_err, Result, ScalarValue}; @@ -97,718 +99,6 @@ macro_rules! get_extreme_value { }; } -/// The maximum `i128` value that can be stored in a `Decimal128` value of precision `p`. -/// -/// Remove this once is available -const MAX_DECIMAL128_FOR_EACH_PRECISION: [i128; 39] = [ - 0, // unused first element - 9, - 99, - 999, - 9999, - 99999, - 999999, - 9999999, - 99999999, - 999999999, - 9999999999, - 99999999999, - 999999999999, - 9999999999999, - 99999999999999, - 999999999999999, - 9999999999999999, - 99999999999999999, - 999999999999999999, - 9999999999999999999, - 99999999999999999999, - 999999999999999999999, - 9999999999999999999999, - 99999999999999999999999, - 999999999999999999999999, - 9999999999999999999999999, - 99999999999999999999999999, - 999999999999999999999999999, - 9999999999999999999999999999, - 99999999999999999999999999999, - 999999999999999999999999999999, - 9999999999999999999999999999999, - 99999999999999999999999999999999, - 999999999999999999999999999999999, - 9999999999999999999999999999999999, - 99999999999999999999999999999999999, - 999999999999999999999999999999999999, - 9999999999999999999999999999999999999, - 99999999999999999999999999999999999999, -]; - -/// The minimum `i128` value that can be stored in a `Decimal128` value of precision `p`. 
-/// -/// Remove this once is available -const MIN_DECIMAL128_FOR_EACH_PRECISION: [i128; 39] = [ - 0, // unused first element - -9, - -99, - -999, - -9999, - -99999, - -999999, - -9999999, - -99999999, - -999999999, - -9999999999, - -99999999999, - -999999999999, - -9999999999999, - -99999999999999, - -999999999999999, - -9999999999999999, - -99999999999999999, - -999999999999999999, - -9999999999999999999, - -99999999999999999999, - -999999999999999999999, - -9999999999999999999999, - -99999999999999999999999, - -999999999999999999999999, - -9999999999999999999999999, - -99999999999999999999999999, - -999999999999999999999999999, - -9999999999999999999999999999, - -99999999999999999999999999999, - -999999999999999999999999999999, - -9999999999999999999999999999999, - -99999999999999999999999999999999, - -999999999999999999999999999999999, - -9999999999999999999999999999999999, - -99999999999999999999999999999999999, - -999999999999999999999999999999999999, - -9999999999999999999999999999999999999, - -99999999999999999999999999999999999999, -]; - -/// The maximum `i256` value that can be stored in a `Decimal256` value of precision `p`. -/// -/// Remove this once is available -const MAX_DECIMAL256_FOR_EACH_PRECISION: [arrow::datatypes::i256; 77] = [ - arrow::datatypes::i256::from_i128(0_i128), // unused first element - arrow::datatypes::i256::from_le_bytes([ - 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 127, 198, 164, 126, 141, 
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 159, 54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, - 37, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, - 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, - 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, - 123, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, - 215, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, - 192, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, - 179, 39, 132, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, - 141, 41, 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, - 50, 130, 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, - 21, 59, 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, - 219, 78, 58, 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, - 20, 71, 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, - 214, 205, 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, - 100, 10, 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, - 241, 103, 168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 
136, 199, 255, 253, 36, - 106, 15, 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, - 37, 154, 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, - 118, 5, 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, - 160, 54, 92, 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, - 69, 34, 154, 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 149, 214, 105, - 18, 178, 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, - 244, 98, 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, - 143, 221, 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, - 151, 167, 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, - 139, 138, 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, - 88, 119, 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, - 119, 169, 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, - 170, 158, 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, - 170, 50, 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, - 250, 197, 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, - 135, 202, 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, - 254, 70, 233, 85, 137, 188, 74, 58, 29, 234, 190, 15, 228, 144, 0, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, - 197, 28, 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, - 187, 31, 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 
153, 56, 0, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, - 242, 80, 61, 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, - ]), - arrow::datatypes::i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, - 121, 41, 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, - ]), -]; - -/// The minimum `i256` value that can be stored in a `Decimal256` value of precision `p`. -/// -/// Remove this once is available -const MIN_DECIMAL256_FOR_EACH_PRECISION: [arrow::datatypes::i256; 77] = [ - arrow::datatypes::i256::from_i128(0_i128), // unused first element - arrow::datatypes::i256::from_le_bytes([ - 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 156, 39, 164, 52, 141, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, - 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, - 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, - 80, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, - 249, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, - 187, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, - 83, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, - 184, 69, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, - 185, 244, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, - 59, 60, 143, 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, - 152, 151, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, - 135, 243, 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, - 74, 131, 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, - 230, 32, 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 194, 138, 251, 146, 220, 208, 127, 95, 201, - 163, 253, 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, - 101, 232, 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, - 131, 120, 232, 198, 249, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, - 199, 30, 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, - 205, 51, 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, - 11, 6, 192, 22, 252, 176, 231, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, - 111, 60, 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 167, 136, 150, - 90, 92, 2, 227, 120, 34, 129, 246, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, - 137, 155, 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 
222, 138, 208, 70, 112, 94, 85, 97, 205, - 98, 19, 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, - 193, 56, 193, 54, 168, 110, 232, 218, 255, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, - 152, 146, 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, - 242, 185, 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, - 67, 181, 197, 226, 21, 65, 240, 27, 111, 255, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, - 162, 20, 185, 219, 218, 138, 98, 23, 87, 250, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, - 90, 90, 206, 58, 149, 140, 108, 217, 233, 102, 199, 255, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, - 136, 135, 15, 76, 212, 125, 61, 126, 34, 5, 202, 253, - ]), - arrow::datatypes::i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, - 84, 75, 155, 248, 74, 234, 102, 238, 88, 51, 228, 233, - ]), -]; - macro_rules! value_transition { ($bound:ident, $direction:expr, $value:expr) => { match $value { diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 571c17119427..a760a3a632bc 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -507,18 +507,19 @@ fn type_union_resolution_coercion( None } - let types = lhs + let coerced_types = lhs .iter() .map(|lhs_field| search_corresponding_coerced_type(lhs_field, rhs)) .collect::>>()?; - let fields = types + // preserve the field name and nullability + let orig_fields = std::iter::zip(lhs.iter(), rhs.iter()); + + let fields: Vec = coerced_types .into_iter() - .enumerate() - .map(|(i, datatype)| { - Arc::new(Field::new(format!("c{i}"), datatype, true)) - }) - .collect::>(); + .zip(orig_fields) + .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs)) + .collect(); Some(DataType::Struct(fields.into())) } _ => { @@ -684,8 +685,10 @@ fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Some(Utf8), (LargeUtf8, _) if rhs_type.is_numeric() => Some(LargeUtf8), + (Utf8View, _) if rhs_type.is_numeric() => Some(Utf8View), (_, Utf8) if lhs_type.is_numeric() => Some(Utf8), (_, LargeUtf8) if lhs_type.is_numeric() => Some(LargeUtf8), + (_, Utf8View) if lhs_type.is_numeric() => Some(Utf8View), _ => None, } } diff --git a/datafusion/functions-aggregate-common/src/merge_arrays.rs b/datafusion/functions-aggregate-common/src/merge_arrays.rs index 9b9a1240c1a1..0cfea662497e 100644 --- a/datafusion/functions-aggregate-common/src/merge_arrays.rs +++ b/datafusion/functions-aggregate-common/src/merge_arrays.rs @@ -193,3 +193,149 @@ pub fn merge_ordered_arrays( Ok((merged_values, merged_orderings)) } + +#[cfg(test)] +mod tests { + use super::*; + + use std::collections::VecDeque; + use std::sync::Arc; + + use arrow::array::{ArrayRef, 
Int64Array}; + + use datafusion_common::utils::get_row_at_idx; + use datafusion_common::{Result, ScalarValue}; + + #[test] + fn test_merge_asc() -> Result<()> { + let lhs_arrays: Vec = vec![ + Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2])), + Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])), + ]; + let n_row = lhs_arrays[0].len(); + let lhs_orderings = (0..n_row) + .map(|idx| get_row_at_idx(&lhs_arrays, idx)) + .collect::>>()?; + + let rhs_arrays: Vec = vec![ + Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2])), + Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])), + ]; + let n_row = rhs_arrays[0].len(); + let rhs_orderings = (0..n_row) + .map(|idx| get_row_at_idx(&rhs_arrays, idx)) + .collect::>>()?; + let sort_options = vec![ + SortOptions { + descending: false, + nulls_first: false, + }, + SortOptions { + descending: false, + nulls_first: false, + }, + ]; + + let lhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])) as ArrayRef; + let lhs_vals = (0..lhs_vals_arr.len()) + .map(|idx| ScalarValue::try_from_array(&lhs_vals_arr, idx)) + .collect::>>()?; + + let rhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])) as ArrayRef; + let rhs_vals = (0..rhs_vals_arr.len()) + .map(|idx| ScalarValue::try_from_array(&rhs_vals_arr, idx)) + .collect::>>()?; + let expected = + Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 3, 3, 4, 4])) as ArrayRef; + let expected_ts = vec![ + Arc::new(Int64Array::from(vec![0, 0, 0, 0, 1, 1, 1, 1, 2, 2])) as ArrayRef, + Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 3, 3, 4, 4])) as ArrayRef, + ]; + + let (merged_vals, merged_ts) = merge_ordered_arrays( + &mut [lhs_vals, rhs_vals], + &mut [lhs_orderings, rhs_orderings], + &sort_options, + )?; + let merged_vals = ScalarValue::iter_to_array(merged_vals.into_iter())?; + let merged_ts = (0..merged_ts[0].len()) + .map(|col_idx| { + ScalarValue::iter_to_array( + (0..merged_ts.len()) + .map(|row_idx| merged_ts[row_idx][col_idx].clone()), + ) + }) + .collect::>>()?; + + assert_eq!(&merged_vals, &expected); + assert_eq!(&merged_ts, &expected_ts); + + Ok(()) + } + + #[test] + fn test_merge_desc() -> Result<()> { + let lhs_arrays: Vec = vec![ + Arc::new(Int64Array::from(vec![2, 1, 1, 0, 0])), + Arc::new(Int64Array::from(vec![4, 3, 2, 1, 0])), + ]; + let n_row = lhs_arrays[0].len(); + let lhs_orderings = (0..n_row) + .map(|idx| get_row_at_idx(&lhs_arrays, idx)) + .collect::>>()?; + + let rhs_arrays: Vec = vec![ + Arc::new(Int64Array::from(vec![2, 1, 1, 0, 0])), + Arc::new(Int64Array::from(vec![4, 3, 2, 1, 0])), + ]; + let n_row = rhs_arrays[0].len(); + let rhs_orderings = (0..n_row) + .map(|idx| get_row_at_idx(&rhs_arrays, idx)) + .collect::>>()?; + let sort_options = vec![ + SortOptions { + descending: true, + nulls_first: false, + }, + SortOptions { + descending: true, + nulls_first: false, + }, + ]; + + // Values (which will be merged) doesn't have to be ordered. 
+ let lhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 1, 2])) as ArrayRef; + let lhs_vals = (0..lhs_vals_arr.len()) + .map(|idx| ScalarValue::try_from_array(&lhs_vals_arr, idx)) + .collect::>>()?; + + let rhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 1, 2])) as ArrayRef; + let rhs_vals = (0..rhs_vals_arr.len()) + .map(|idx| ScalarValue::try_from_array(&rhs_vals_arr, idx)) + .collect::>>()?; + let expected = + Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 1, 1, 2, 2])) as ArrayRef; + let expected_ts = vec![ + Arc::new(Int64Array::from(vec![2, 2, 1, 1, 1, 1, 0, 0, 0, 0])) as ArrayRef, + Arc::new(Int64Array::from(vec![4, 4, 3, 3, 2, 2, 1, 1, 0, 0])) as ArrayRef, + ]; + let (merged_vals, merged_ts) = merge_ordered_arrays( + &mut [lhs_vals, rhs_vals], + &mut [lhs_orderings, rhs_orderings], + &sort_options, + )?; + let merged_vals = ScalarValue::iter_to_array(merged_vals.into_iter())?; + let merged_ts = (0..merged_ts[0].len()) + .map(|col_idx| { + ScalarValue::iter_to_array( + (0..merged_ts.len()) + .map(|row_idx| merged_ts[row_idx][col_idx].clone()), + ) + }) + .collect::>>()?; + + assert_eq!(&merged_vals, &expected); + assert_eq!(&merged_ts, &expected_ts); + Ok(()) + } +} diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 9fff05999122..2a2b73b1b5e7 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -18,9 +18,9 @@ //! `ARRAY_AGG` aggregate implementation: [`ArrayAgg`] use arrow::array::{new_empty_array, Array, ArrayRef, AsArray, ListArray, StructArray}; -use arrow::datatypes::DataType; +use arrow::compute::SortOptions; +use arrow::datatypes::{DataType, Field, Fields}; -use arrow_schema::{Field, Fields}; use datafusion_common::cast::as_list_array; use datafusion_common::utils::{get_row_at_idx, SingleRowListArrayBuilder}; use datafusion_common::{exec_err, ScalarValue}; @@ -33,6 +33,7 @@ use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; use datafusion_macros::user_doc; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; +use std::cmp::Ordering; use std::collections::{HashSet, VecDeque}; use std::mem::{size_of, size_of_val}; use std::sync::Arc; @@ -132,7 +133,32 @@ impl AggregateUDFImpl for ArrayAgg { let data_type = acc_args.exprs[0].data_type(acc_args.schema)?; if acc_args.is_distinct { - return Ok(Box::new(DistinctArrayAggAccumulator::try_new(&data_type)?)); + // Limitation similar to Postgres. The aggregation function can only mix + // DISTINCT and ORDER BY if all the expressions in the ORDER BY appear + // also in the arguments of the function. For example: + // + // ARRAY_AGG(DISTINCT col) + // + // can only be mixed with an ORDER BY if the order expression is "col". 
+ // + // ARRAY_AGG(DISTINCT col ORDER BY col) <- Valid + // ARRAY_AGG(DISTINCT concat(col, '') ORDER BY concat(col, '')) <- Valid + // ARRAY_AGG(DISTINCT col ORDER BY other_col) <- Invalid + // ARRAY_AGG(DISTINCT col ORDER BY concat(col, '')) <- Invalid + if acc_args.ordering_req.len() > 1 { + return exec_err!("In an aggregate with DISTINCT, ORDER BY expressions must appear in argument list"); + } + let mut sort_option: Option = None; + if let Some(order) = acc_args.ordering_req.first() { + if !order.expr.eq(&acc_args.exprs[0]) { + return exec_err!("In an aggregate with DISTINCT, ORDER BY expressions must appear in argument list"); + } + sort_option = Some(order.options) + } + return Ok(Box::new(DistinctArrayAggAccumulator::try_new( + &data_type, + sort_option, + )?)); } if acc_args.ordering_req.is_empty() { @@ -322,13 +348,18 @@ impl Accumulator for ArrayAggAccumulator { struct DistinctArrayAggAccumulator { values: HashSet, datatype: DataType, + sort_options: Option, } impl DistinctArrayAggAccumulator { - pub fn try_new(datatype: &DataType) -> Result { + pub fn try_new( + datatype: &DataType, + sort_options: Option, + ) -> Result { Ok(Self { values: HashSet::new(), datatype: datatype.clone(), + sort_options, }) } } @@ -339,8 +370,8 @@ impl Accumulator for DistinctArrayAggAccumulator { } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - if values.len() != 1 { - return internal_err!("expects single batch"); + if values.is_empty() { + return Ok(()); } let array = &values[0]; @@ -370,10 +401,32 @@ impl Accumulator for DistinctArrayAggAccumulator { } fn evaluate(&mut self) -> Result { - let values: Vec = self.values.iter().cloned().collect(); + let mut values: Vec = self.values.iter().cloned().collect(); if values.is_empty() { return Ok(ScalarValue::new_null_list(self.datatype.clone(), true, 1)); } + + if let Some(opts) = self.sort_options { + values.sort_by(|a, b| { + if a.is_null() { + return match opts.nulls_first { + true => Ordering::Less, + false => Ordering::Greater, + }; + } + if b.is_null() { + return match opts.nulls_first { + true => Ordering::Greater, + false => Ordering::Less, + }; + } + match opts.descending { + true => b.partial_cmp(a).unwrap_or(Ordering::Equal), + false => a.partial_cmp(b).unwrap_or(Ordering::Equal), + } + }); + }; + let arr = ScalarValue::new_list(&values, &self.datatype, true); Ok(ScalarValue::List(arr)) } @@ -383,6 +436,8 @@ impl Accumulator for DistinctArrayAggAccumulator { - size_of_val(&self.values) + self.datatype.size() - size_of_val(&self.datatype) + - size_of_val(&self.sort_options) + + size_of::>() } } @@ -599,146 +654,321 @@ impl OrderSensitiveArrayAggAccumulator { #[cfg(test)] mod tests { use super::*; - - use std::collections::VecDeque; + use arrow::datatypes::{FieldRef, Schema}; + use datafusion_common::cast::as_generic_string_array; + use datafusion_common::internal_err; + use datafusion_physical_expr::expressions::Column; + use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use std::sync::Arc; - use arrow::array::Int64Array; - use arrow_schema::SortOptions; + #[test] + fn no_duplicates_no_distinct() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string().build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"])])?; + acc2.update_batch(&[data(["d", "e", "f"])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); - use datafusion_common::utils::get_row_at_idx; - use datafusion_common::{Result, ScalarValue}; + assert_eq!(result, 
vec!["a", "b", "c", "d", "e", "f"]); + + Ok(()) + } #[test] - fn test_merge_asc() -> Result<()> { - let lhs_arrays: Vec = vec![ - Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2])), - Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])), - ]; - let n_row = lhs_arrays[0].len(); - let lhs_orderings = (0..n_row) - .map(|idx| get_row_at_idx(&lhs_arrays, idx)) - .collect::>>()?; - - let rhs_arrays: Vec = vec![ - Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2])), - Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])), - ]; - let n_row = rhs_arrays[0].len(); - let rhs_orderings = (0..n_row) - .map(|idx| get_row_at_idx(&rhs_arrays, idx)) - .collect::>>()?; - let sort_options = vec![ - SortOptions { - descending: false, - nulls_first: false, - }, - SortOptions { - descending: false, - nulls_first: false, - }, - ]; - - let lhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])) as ArrayRef; - let lhs_vals = (0..lhs_vals_arr.len()) - .map(|idx| ScalarValue::try_from_array(&lhs_vals_arr, idx)) - .collect::>>()?; - - let rhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4])) as ArrayRef; - let rhs_vals = (0..rhs_vals_arr.len()) - .map(|idx| ScalarValue::try_from_array(&rhs_vals_arr, idx)) - .collect::>>()?; - let expected = - Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 3, 3, 4, 4])) as ArrayRef; - let expected_ts = vec![ - Arc::new(Int64Array::from(vec![0, 0, 0, 0, 1, 1, 1, 1, 2, 2])) as ArrayRef, - Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 3, 3, 4, 4])) as ArrayRef, - ]; - - let (merged_vals, merged_ts) = merge_ordered_arrays( - &mut [lhs_vals, rhs_vals], - &mut [lhs_orderings, rhs_orderings], - &sort_options, - )?; - let merged_vals = ScalarValue::iter_to_array(merged_vals.into_iter())?; - let merged_ts = (0..merged_ts[0].len()) - .map(|col_idx| { - ScalarValue::iter_to_array( - (0..merged_ts.len()) - .map(|row_idx| merged_ts[row_idx][col_idx].clone()), - ) - }) - .collect::>>()?; + fn no_duplicates_distinct() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"])])?; + acc2.update_batch(&[data(["d", "e", "f"])])?; + acc1 = merge(acc1, acc2)?; - assert_eq!(&merged_vals, &expected); - assert_eq!(&merged_ts, &expected_ts); + let mut result = print_nulls(str_arr(acc1.evaluate()?)?); + result.sort(); + + assert_eq!(result, vec!["a", "b", "c", "d", "e", "f"]); Ok(()) } #[test] - fn test_merge_desc() -> Result<()> { - let lhs_arrays: Vec = vec![ - Arc::new(Int64Array::from(vec![2, 1, 1, 0, 0])), - Arc::new(Int64Array::from(vec![4, 3, 2, 1, 0])), - ]; - let n_row = lhs_arrays[0].len(); - let lhs_orderings = (0..n_row) - .map(|idx| get_row_at_idx(&lhs_arrays, idx)) - .collect::>>()?; - - let rhs_arrays: Vec = vec![ - Arc::new(Int64Array::from(vec![2, 1, 1, 0, 0])), - Arc::new(Int64Array::from(vec![4, 3, 2, 1, 0])), - ]; - let n_row = rhs_arrays[0].len(); - let rhs_orderings = (0..n_row) - .map(|idx| get_row_at_idx(&rhs_arrays, idx)) - .collect::>>()?; - let sort_options = vec![ - SortOptions { - descending: true, - nulls_first: false, - }, - SortOptions { - descending: true, - nulls_first: false, - }, - ]; - - // Values (which will be merged) doesn't have to be ordered. 
- let lhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 1, 2])) as ArrayRef; - let lhs_vals = (0..lhs_vals_arr.len()) - .map(|idx| ScalarValue::try_from_array(&lhs_vals_arr, idx)) - .collect::>>()?; - - let rhs_vals_arr = Arc::new(Int64Array::from(vec![0, 1, 2, 1, 2])) as ArrayRef; - let rhs_vals = (0..rhs_vals_arr.len()) - .map(|idx| ScalarValue::try_from_array(&rhs_vals_arr, idx)) - .collect::>>()?; - let expected = - Arc::new(Int64Array::from(vec![0, 0, 1, 1, 2, 2, 1, 1, 2, 2])) as ArrayRef; - let expected_ts = vec![ - Arc::new(Int64Array::from(vec![2, 2, 1, 1, 1, 1, 0, 0, 0, 0])) as ArrayRef, - Arc::new(Int64Array::from(vec![4, 4, 3, 3, 2, 2, 1, 1, 0, 0])) as ArrayRef, - ]; - let (merged_vals, merged_ts) = merge_ordered_arrays( - &mut [lhs_vals, rhs_vals], - &mut [lhs_orderings, rhs_orderings], - &sort_options, - )?; - let merged_vals = ScalarValue::iter_to_array(merged_vals.into_iter())?; - let merged_ts = (0..merged_ts[0].len()) - .map(|col_idx| { - ScalarValue::iter_to_array( - (0..merged_ts.len()) - .map(|row_idx| merged_ts[row_idx][col_idx].clone()), - ) - }) - .collect::>>()?; + fn duplicates_no_distinct() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string().build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"])])?; + acc2.update_batch(&[data(["a", "b", "c"])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["a", "b", "c", "a", "b", "c"]); - assert_eq!(&merged_vals, &expected); - assert_eq!(&merged_ts, &expected_ts); Ok(()) } + + #[test] + fn duplicates_distinct() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"])])?; + acc2.update_batch(&[data(["a", "b", "c"])])?; + acc1 = merge(acc1, acc2)?; + + let mut result = print_nulls(str_arr(acc1.evaluate()?)?); + result.sort(); + + assert_eq!(result, vec!["a", "b", "c"]); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_asc() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(false, false)) + .build_two()?; + + acc1.update_batch(&[data(["e", "b", "d"])])?; + acc2.update_batch(&[data(["f", "a", "c"])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["a", "b", "c", "d", "e", "f"]); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_desc() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(true, false)) + .build_two()?; + + acc1.update_batch(&[data(["e", "b", "d"])])?; + acc2.update_batch(&[data(["f", "a", "c"])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["f", "e", "d", "c", "b", "a"]); + + Ok(()) + } + + #[test] + fn duplicates_distinct_sort_asc() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(false, false)) + .build_two()?; + + acc1.update_batch(&[data(["a", "c", "b"])])?; + acc2.update_batch(&[data(["b", "c", "a"])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["a", "b", "c"]); + + Ok(()) + } + + #[test] + fn duplicates_distinct_sort_desc() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + 
.order_by_col("col", SortOptions::new(true, false)) + .build_two()?; + + acc1.update_batch(&[data(["a", "c", "b"])])?; + acc2.update_batch(&[data(["b", "c", "a"])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["c", "b", "a"]); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_asc_nulls_first() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(false, true)) + .build_two()?; + + acc1.update_batch(&[data([Some("e"), Some("b"), None])])?; + acc2.update_batch(&[data([Some("f"), Some("a"), None])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["NULL", "a", "b", "e", "f"]); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_asc_nulls_last() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(false, false)) + .build_two()?; + + acc1.update_batch(&[data([Some("e"), Some("b"), None])])?; + acc2.update_batch(&[data([Some("f"), Some("a"), None])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["a", "b", "e", "f", "NULL"]); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_desc_nulls_first() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(true, true)) + .build_two()?; + + acc1.update_batch(&[data([Some("e"), Some("b"), None])])?; + acc2.update_batch(&[data([Some("f"), Some("a"), None])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["NULL", "f", "e", "b", "a"]); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_desc_nulls_last() -> Result<()> { + let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string() + .distinct() + .order_by_col("col", SortOptions::new(true, false)) + .build_two()?; + + acc1.update_batch(&[data([Some("e"), Some("b"), None])])?; + acc2.update_batch(&[data([Some("f"), Some("a"), None])])?; + acc1 = merge(acc1, acc2)?; + + let result = print_nulls(str_arr(acc1.evaluate()?)?); + + assert_eq!(result, vec!["f", "e", "b", "a", "NULL"]); + + Ok(()) + } + + struct ArrayAggAccumulatorBuilder { + data_type: DataType, + distinct: bool, + ordering: LexOrdering, + schema: Schema, + } + + impl ArrayAggAccumulatorBuilder { + fn string() -> Self { + Self::new(DataType::Utf8) + } + + fn new(data_type: DataType) -> Self { + Self { + data_type: data_type.clone(), + distinct: Default::default(), + ordering: Default::default(), + schema: Schema { + fields: Fields::from(vec![Field::new( + "col", + DataType::List(FieldRef::new(Field::new( + "item", data_type, true, + ))), + true, + )]), + metadata: Default::default(), + }, + } + } + + fn distinct(mut self) -> Self { + self.distinct = true; + self + } + + fn order_by_col(mut self, col: &str, sort_options: SortOptions) -> Self { + self.ordering.extend([PhysicalSortExpr::new( + Arc::new( + Column::new_with_schema(col, &self.schema) + .expect("column not available in schema"), + ), + sort_options, + )]); + self + } + + fn build(&self) -> Result> { + ArrayAgg::default().accumulator(AccumulatorArgs { + return_type: &self.data_type, + schema: &self.schema, + ignore_nulls: false, + ordering_req: &self.ordering, + is_reversed: false, + name: "", + is_distinct: self.distinct, + exprs: 
&[Arc::new(Column::new("col", 0))], + }) + } + + fn build_two(&self) -> Result<(Box, Box)> { + Ok((self.build()?, self.build()?)) + } + } + + fn str_arr(value: ScalarValue) -> Result>> { + let ScalarValue::List(list) = value else { + return internal_err!("ScalarValue was not a List"); + }; + Ok(as_generic_string_array::(list.values())? + .iter() + .map(|v| v.map(|v| v.to_string())) + .collect()) + } + + fn print_nulls(sort: Vec>) -> Vec { + sort.into_iter() + .map(|v| v.unwrap_or("NULL".to_string())) + .collect() + } + + fn data(list: [T; N]) -> ArrayRef + where + ScalarValue: From, + { + let values: Vec<_> = list.into_iter().map(ScalarValue::from).collect(); + ScalarValue::iter_to_array(values).expect("Cannot convert to array") + } + + fn merge( + mut acc1: Box, + mut acc2: Box, + ) -> Result> { + let intermediate_state = acc2.state().and_then(|e| { + e.iter() + .map(|v| v.to_array()) + .collect::>>() + })?; + acc1.merge_batch(&intermediate_state)?; + Ok(acc1) + } } diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs index 0cd403cff428..ab064ad666de 100644 --- a/datafusion/functions-aggregate/src/string_agg.rs +++ b/datafusion/functions-aggregate/src/string_agg.rs @@ -17,15 +17,17 @@ //! [`StringAgg`] accumulator for the `string_agg` function +use crate::array_agg::ArrayAgg; use arrow::array::ArrayRef; -use arrow_schema::DataType; +use arrow::datatypes::{DataType, Field}; use datafusion_common::cast::as_generic_string_array; use datafusion_common::Result; -use datafusion_common::{not_impl_err, ScalarValue}; +use datafusion_common::{internal_err, not_impl_err, ScalarValue}; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, Volatility, }; +use datafusion_functions_aggregate_common::accumulator::StateFieldsArgs; use datafusion_macros::user_doc; use datafusion_physical_expr::expressions::Literal; use std::any::Any; @@ -65,6 +67,7 @@ make_udaf_expr_and_func!( #[derive(Debug)] pub struct StringAgg { signature: Signature, + array_agg: ArrayAgg, } impl StringAgg { @@ -76,9 +79,13 @@ impl StringAgg { TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8]), TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Null]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::LargeUtf8]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::Null]), ], Volatility::Immutable, ), + array_agg: Default::default(), } } } @@ -106,20 +113,40 @@ impl AggregateUDFImpl for StringAgg { Ok(DataType::LargeUtf8) } + fn state_fields(&self, args: StateFieldsArgs) -> Result> { + self.array_agg.state_fields(args) + } + fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { - if let Some(lit) = acc_args.exprs[1].as_any().downcast_ref::() { - return match lit.value().try_as_str() { - Some(Some(delimiter)) => { - Ok(Box::new(StringAggAccumulator::new(delimiter))) - } - Some(None) => Ok(Box::new(StringAggAccumulator::new(""))), - None => { - not_impl_err!("StringAgg not supported for delimiter {}", lit.value()) - } - }; - } + let Some(lit) = acc_args.exprs[1].as_any().downcast_ref::() else { + return not_impl_err!( + "The second argument of the string_agg function must be a string literal" + ); + }; + + let delimiter = if lit.value().is_null() { + // If the second argument (the delimiter that joins strings) is NULL, join + // 
on an empty string. (e.g. [a, b, c] => "abc"). + "" + } else if let Some(lit_string) = lit.value().try_as_str() { + lit_string.unwrap_or("") + } else { + return not_impl_err!( + "StringAgg not supported for delimiter {}", + lit.value() + ); + }; - not_impl_err!("expect literal") + let array_agg_acc = self.array_agg.accumulator(AccumulatorArgs { + return_type: &DataType::new_list(acc_args.return_type.clone(), true), + exprs: &filter_index(acc_args.exprs, 1), + ..acc_args + })?; + + Ok(Box::new(StringAggAccumulator::new( + array_agg_acc, + delimiter, + ))) } fn documentation(&self) -> Option<&Documentation> { @@ -129,14 +156,14 @@ impl AggregateUDFImpl for StringAgg { #[derive(Debug)] pub(crate) struct StringAggAccumulator { - values: Option, + array_agg_acc: Box, delimiter: String, } impl StringAggAccumulator { - pub fn new(delimiter: &str) -> Self { + pub fn new(array_agg_acc: Box, delimiter: &str) -> Self { Self { - values: None, + array_agg_acc, delimiter: delimiter.to_string(), } } @@ -144,37 +171,311 @@ impl StringAggAccumulator { impl Accumulator for StringAggAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - let string_array: Vec<_> = as_generic_string_array::(&values[0])? - .iter() - .filter_map(|v| v.as_ref().map(ToString::to_string)) - .collect(); - if !string_array.is_empty() { - let s = string_array.join(self.delimiter.as_str()); - let v = self.values.get_or_insert("".to_string()); - if !v.is_empty() { - v.push_str(self.delimiter.as_str()); + self.array_agg_acc.update_batch(&filter_index(values, 1)) + } + + fn evaluate(&mut self) -> Result { + let scalar = self.array_agg_acc.evaluate()?; + + let ScalarValue::List(list) = scalar else { + return internal_err!("Expected a DataType::List while evaluating underlying ArrayAggAccumulator, but got {}", scalar.data_type()); + }; + + let string_arr: Vec<_> = match list.value_type() { + DataType::LargeUtf8 => as_generic_string_array::(list.values())? + .iter() + .flatten() + .collect(), + DataType::Utf8 => as_generic_string_array::(list.values())? 
+ .iter() + .flatten() + .collect(), + _ => { + return internal_err!( + "Expected elements to of type Utf8 or LargeUtf8, but got {}", + list.value_type() + ) } - v.push_str(s.as_str()); + }; + + if string_arr.is_empty() { + return Ok(ScalarValue::LargeUtf8(None)); } - Ok(()) + + Ok(ScalarValue::LargeUtf8(Some( + string_arr.join(&self.delimiter), + ))) + } + + fn size(&self) -> usize { + size_of_val(self) - size_of_val(&self.array_agg_acc) + + self.array_agg_acc.size() + + self.delimiter.capacity() + } + + fn state(&mut self) -> Result> { + self.array_agg_acc.state() } fn merge_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - self.update_batch(values)?; + self.array_agg_acc.merge_batch(values) + } +} + +fn filter_index(values: &[T], index: usize) -> Vec { + values + .iter() + .enumerate() + .filter(|(i, _)| *i != index) + .map(|(_, v)| v) + .cloned() + .collect::>() +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::LargeStringArray; + use arrow::compute::SortOptions; + use arrow::datatypes::{Fields, Schema}; + use datafusion_common::internal_err; + use datafusion_physical_expr::expressions::Column; + use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; + use std::sync::Arc; + + #[test] + fn no_duplicates_no_distinct() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",").build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"]), data([","])])?; + acc2.update_batch(&[data(["d", "e", "f"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str(acc1.evaluate()?); + + assert_eq!(result, "a,b,c,d,e,f"); + Ok(()) } - fn state(&mut self) -> Result> { - Ok(vec![self.evaluate()?]) + #[test] + fn no_duplicates_distinct() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",") + .distinct() + .build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"]), data([","])])?; + acc2.update_batch(&[data(["d", "e", "f"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str_sorted(acc1.evaluate()?, ","); + + assert_eq!(result, "a,b,c,d,e,f"); + + Ok(()) } - fn evaluate(&mut self) -> Result { - Ok(ScalarValue::LargeUtf8(self.values.clone())) + #[test] + fn duplicates_no_distinct() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",").build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"]), data([","])])?; + acc2.update_batch(&[data(["a", "b", "c"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str(acc1.evaluate()?); + + assert_eq!(result, "a,b,c,a,b,c"); + + Ok(()) } - fn size(&self) -> usize { - size_of_val(self) - + self.values.as_ref().map(|v| v.capacity()).unwrap_or(0) - + self.delimiter.capacity() + #[test] + fn duplicates_distinct() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",") + .distinct() + .build_two()?; + + acc1.update_batch(&[data(["a", "b", "c"]), data([","])])?; + acc2.update_batch(&[data(["a", "b", "c"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str_sorted(acc1.evaluate()?, ","); + + assert_eq!(result, "a,b,c"); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_asc() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",") + .distinct() + .order_by_col("col", SortOptions::new(false, false)) + .build_two()?; + + acc1.update_batch(&[data(["e", "b", "d"]), data([","])])?; + acc2.update_batch(&[data(["f", "a", "c"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str(acc1.evaluate()?); 
+ + assert_eq!(result, "a,b,c,d,e,f"); + + Ok(()) + } + + #[test] + fn no_duplicates_distinct_sort_desc() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",") + .distinct() + .order_by_col("col", SortOptions::new(true, false)) + .build_two()?; + + acc1.update_batch(&[data(["e", "b", "d"]), data([","])])?; + acc2.update_batch(&[data(["f", "a", "c"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str(acc1.evaluate()?); + + assert_eq!(result, "f,e,d,c,b,a"); + + Ok(()) + } + + #[test] + fn duplicates_distinct_sort_asc() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",") + .distinct() + .order_by_col("col", SortOptions::new(false, false)) + .build_two()?; + + acc1.update_batch(&[data(["a", "c", "b"]), data([","])])?; + acc2.update_batch(&[data(["b", "c", "a"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str(acc1.evaluate()?); + + assert_eq!(result, "a,b,c"); + + Ok(()) + } + + #[test] + fn duplicates_distinct_sort_desc() -> Result<()> { + let (mut acc1, mut acc2) = StringAggAccumulatorBuilder::new(",") + .distinct() + .order_by_col("col", SortOptions::new(true, false)) + .build_two()?; + + acc1.update_batch(&[data(["a", "c", "b"]), data([","])])?; + acc2.update_batch(&[data(["b", "c", "a"]), data([","])])?; + acc1 = merge(acc1, acc2)?; + + let result = some_str(acc1.evaluate()?); + + assert_eq!(result, "c,b,a"); + + Ok(()) + } + + struct StringAggAccumulatorBuilder { + sep: String, + distinct: bool, + ordering: LexOrdering, + schema: Schema, + } + + impl StringAggAccumulatorBuilder { + fn new(sep: &str) -> Self { + Self { + sep: sep.to_string(), + distinct: Default::default(), + ordering: Default::default(), + schema: Schema { + fields: Fields::from(vec![Field::new( + "col", + DataType::LargeUtf8, + true, + )]), + metadata: Default::default(), + }, + } + } + fn distinct(mut self) -> Self { + self.distinct = true; + self + } + + fn order_by_col(mut self, col: &str, sort_options: SortOptions) -> Self { + self.ordering.extend([PhysicalSortExpr::new( + Arc::new( + Column::new_with_schema(col, &self.schema) + .expect("column not available in schema"), + ), + sort_options, + )]); + self + } + + fn build(&self) -> Result> { + StringAgg::new().accumulator(AccumulatorArgs { + return_type: &DataType::LargeUtf8, + schema: &self.schema, + ignore_nulls: false, + ordering_req: &self.ordering, + is_reversed: false, + name: "", + is_distinct: self.distinct, + exprs: &[ + Arc::new(Column::new("col", 0)), + Arc::new(Literal::new(ScalarValue::Utf8(Some(self.sep.to_string())))), + ], + }) + } + + fn build_two(&self) -> Result<(Box, Box)> { + Ok((self.build()?, self.build()?)) + } + } + + fn some_str(value: ScalarValue) -> String { + str(value) + .expect("ScalarValue was not a String") + .expect("ScalarValue was None") + } + + fn some_str_sorted(value: ScalarValue, sep: &str) -> String { + let value = some_str(value); + let mut parts: Vec<&str> = value.split(sep).collect(); + parts.sort(); + parts.join(sep) + } + + fn str(value: ScalarValue) -> Result> { + match value { + ScalarValue::LargeUtf8(v) => Ok(v), + _ => internal_err!( + "Expected ScalarValue::LargeUtf8, got {}", + value.data_type() + ), + } + } + + fn data(list: [&str; N]) -> ArrayRef { + Arc::new(LargeStringArray::from(list.to_vec())) + } + + fn merge( + mut acc1: Box, + mut acc2: Box, + ) -> Result> { + let intermediate_state = acc2.state().and_then(|e| { + e.iter() + .map(|v| v.to_array()) + .collect::>>() + })?; + 
acc1.merge_batch(&intermediate_state)?; + Ok(acc1) } } diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index 468d3d548bcf..1f99cc3a5f0b 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -18,7 +18,7 @@ extern crate criterion; use arrow::array::builder::StringBuilder; -use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray}; +use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray, StringViewArray}; use arrow::compute::cast; use arrow::datatypes::DataType; use criterion::{black_box, criterion_group, criterion_main, Criterion}; @@ -141,6 +141,20 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); + c.bench_function("regexp_like_1000 utf8view", |b| { + let mut rng = rand::thread_rng(); + let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap(); + let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap(); + let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap(); + + b.iter(|| { + black_box( + regexp_like(&[Arc::clone(&data), Arc::clone(®ex), Arc::clone(&flags)]) + .expect("regexp_like should work on valid values"), + ) + }) + }); + c.bench_function("regexp_match_1000", |b| { let mut rng = rand::thread_rng(); let data = Arc::new(data(&mut rng)) as ArrayRef; @@ -149,7 +163,25 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - regexp_match::(&[ + regexp_match(&[ + Arc::clone(&data), + Arc::clone(®ex), + Arc::clone(&flags), + ]) + .expect("regexp_match should work on valid values"), + ) + }) + }); + + c.bench_function("regexp_match_1000 utf8view", |b| { + let mut rng = rand::thread_rng(); + let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap(); + let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap(); + let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap(); + + b.iter(|| { + black_box( + regexp_match(&[ Arc::clone(&data), Arc::clone(®ex), Arc::clone(&flags), @@ -180,6 +212,29 @@ fn criterion_benchmark(c: &mut Criterion) { ) }) }); + + c.bench_function("regexp_replace_1000 utf8view", |b| { + let mut rng = rand::thread_rng(); + let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap(); + let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap(); + // flags are not allowed to be utf8view according to the function + let flags = Arc::new(flags(&mut rng)) as ArrayRef; + let replacement = Arc::new(StringViewArray::from_iter_values( + iter::repeat("XX").take(1000), + )); + + b.iter(|| { + black_box( + regexp_replace::( + data.as_string_view(), + regex.as_string_view(), + &replacement, + Some(&flags), + ) + .expect("regexp_replace should work on valid values"), + ) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 06b9a9d98b47..57207ecfdacd 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -16,16 +16,14 @@ // under the License. //! 
Regex expressions -use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::array::{Array, ArrayRef, AsArray}; use arrow::compute::kernels::regexp; use arrow::datatypes::DataType; use arrow::datatypes::Field; use datafusion_common::exec_err; use datafusion_common::ScalarValue; use datafusion_common::{arrow_datafusion_err, plan_err}; -use datafusion_common::{ - cast::as_generic_string_array, internal_err, DataFusionError, Result, -}; +use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{ColumnarValue, Documentation, TypeSignature}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; @@ -86,11 +84,12 @@ impl RegexpMatchFunc { signature: Signature::one_of( vec![ // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`. - // If that fails, it proceeds to `(LargeUtf8, Utf8)`. - // TODO: Native support Utf8View for regexp_match. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. + TypeSignature::Exact(vec![Utf8View, Utf8View]), TypeSignature::Exact(vec![Utf8, Utf8]), TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Utf8View]), TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], @@ -138,7 +137,7 @@ impl ScalarUDFImpl for RegexpMatchFunc { .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; - let result = regexp_match_func(&args); + let result = regexp_match(&args); if is_scalar { // If all inputs are scalar, keeps output as scalar let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); @@ -153,33 +152,35 @@ impl ScalarUDFImpl for RegexpMatchFunc { } } -fn regexp_match_func(args: &[ArrayRef]) -> Result { - match args[0].data_type() { - DataType::Utf8 => regexp_match::(args), - DataType::LargeUtf8 => regexp_match::(args), - other => { - internal_err!("Unsupported data type {other:?} for function regexp_match") - } - } -} -pub fn regexp_match(args: &[ArrayRef]) -> Result { +pub fn regexp_match(args: &[ArrayRef]) -> Result { match args.len() { 2 => { - let values = as_generic_string_array::(&args[0])?; - let regex = as_generic_string_array::(&args[1])?; - regexp::regexp_match(values, regex, None) + regexp::regexp_match(&args[0], &args[1], None) .map_err(|e| arrow_datafusion_err!(e)) } 3 => { - let values = as_generic_string_array::(&args[0])?; - let regex = as_generic_string_array::(&args[1])?; - let flags = as_generic_string_array::(&args[2])?; - - if flags.iter().any(|s| s == Some("g")) { - return plan_err!("regexp_match() does not support the \"global\" option"); + match args[2].data_type() { + DataType::Utf8View => { + if args[2].as_string_view().iter().any(|s| s == Some("g")) { + return plan_err!("regexp_match() does not support the \"global\" option"); + } + } + DataType::Utf8 => { + if args[2].as_string::().iter().any(|s| s == Some("g")) { + return plan_err!("regexp_match() does not support the \"global\" option"); + } + } + DataType::LargeUtf8 => { + if args[2].as_string::().iter().any(|s| s == Some("g")) { + return plan_err!("regexp_match() does not support the \"global\" option"); + } + } + e => { + return plan_err!("regexp_match was called with unexpected data type {e:?}"); + } } - regexp::regexp_match(values, regex, Some(flags)) + regexp::regexp_match(&args[0], 
&args[1], Some(&args[2])) .map_err(|e| arrow_datafusion_err!(e)) } other => exec_err!( @@ -211,7 +212,7 @@ mod tests { expected_builder.append(false); let expected = expected_builder.finish(); - let re = regexp_match::(&[Arc::new(values), Arc::new(patterns)]).unwrap(); + let re = regexp_match(&[Arc::new(values), Arc::new(patterns)]).unwrap(); assert_eq!(re.as_ref(), &expected); } @@ -236,9 +237,8 @@ mod tests { expected_builder.append(false); let expected = expected_builder.finish(); - let re = - regexp_match::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) - .unwrap(); + let re = regexp_match(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) + .unwrap(); assert_eq!(re.as_ref(), &expected); } @@ -250,7 +250,7 @@ mod tests { let flags = StringArray::from(vec!["g"]); let re_err = - regexp_match::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) + regexp_match(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) .expect_err("unsupported flag should have failed"); assert_eq!(re_err.strip_backtrace(), "Error during planning: regexp_match() does not support the \"global\" option"); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 48a5e2f9a07c..7a41f54c56e1 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -190,7 +190,15 @@ impl<'a> TypeCoercionRewriter<'a> { .map(|(lhs, rhs)| { // coerce the arguments as though they were a single binary equality // expression - let (lhs, rhs) = self.coerce_binary_op(lhs, Operator::Eq, rhs)?; + let left_schema = join.left.schema(); + let right_schema = join.right.schema(); + let (lhs, rhs) = self.coerce_binary_op( + lhs, + left_schema, + Operator::Eq, + rhs, + right_schema, + )?; Ok((lhs, rhs)) }) .collect::>>()?; @@ -275,17 +283,19 @@ impl<'a> TypeCoercionRewriter<'a> { fn coerce_binary_op( &self, left: Expr, + left_schema: &DFSchema, op: Operator, right: Expr, + right_schema: &DFSchema, ) -> Result<(Expr, Expr)> { let (left_type, right_type) = get_input_types( - &left.get_type(self.schema)?, + &left.get_type(left_schema)?, &op, - &right.get_type(self.schema)?, + &right.get_type(right_schema)?, )?; Ok(( - left.cast_to(&left_type, self.schema)?, - right.cast_to(&right_type, self.schema)?, + left.cast_to(&left_type, left_schema)?, + right.cast_to(&right_type, right_schema)?, )) } } @@ -404,7 +414,8 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { )))) } Expr::BinaryExpr(BinaryExpr { left, op, right }) => { - let (left, right) = self.coerce_binary_op(*left, op, *right)?; + let (left, right) = + self.coerce_binary_op(*left, self.schema, op, *right, self.schema)?; Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr::new( Box::new(left), op, diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 892d450ba85b..e2b8a966cb92 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -26,7 +26,8 @@ use crate::{OptimizerConfig, OptimizerRule}; use crate::utils::NamePreserver; use arrow::datatypes::{ - DataType, TimeUnit, MAX_DECIMAL_FOR_EACH_PRECISION, MIN_DECIMAL_FOR_EACH_PRECISION, + DataType, TimeUnit, MAX_DECIMAL128_FOR_EACH_PRECISION, + MIN_DECIMAL128_FOR_EACH_PRECISION, }; use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS}; use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; @@ -369,8 +370,8 @@ 
fn try_cast_numeric_literal( // Different precision for decimal128 can store different range of value. // For example, the precision is 3, the max of value is `999` and the min // value is `-999` - MIN_DECIMAL_FOR_EACH_PRECISION[*precision as usize - 1], - MAX_DECIMAL_FOR_EACH_PRECISION[*precision as usize - 1], + MIN_DECIMAL128_FOR_EACH_PRECISION[*precision as usize], + MAX_DECIMAL128_FOR_EACH_PRECISION[*precision as usize], ), _ => return None, }; diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index c8c544785d6d..ad8584196c68 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -176,6 +176,17 @@ FROM ---- [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8] +# array agg can use order by with distinct +query ? +SELECT array_agg(DISTINCT c13 ORDER BY c13) +FROM + (SELECT * + FROM aggregate_test_100 + ORDER BY c13 + LIMIT 5) as t1 +---- +[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8] + statement ok CREATE EXTERNAL TABLE agg_order ( c1 INT NOT NULL, @@ -234,6 +245,11 @@ select column1, nth_value(column3, 2 order by column2, column4 desc) from array_ b [4, 5, 6] w [9, 5, 2] +query ? +select array_agg(DISTINCT column2 order by column2 desc) from array_agg_order_list_table; +---- +[2, 1] + statement ok drop table array_agg_order_list_table; @@ -5532,6 +5548,11 @@ SELECT STRING_AGG(column1, '|') FROM (values (''), (null), ('')); ---- | +query T +SELECT STRING_AGG(DISTINCT column1, '|') FROM (values (''), (null), ('')); +---- +(empty) + statement ok CREATE TABLE strings(g INTEGER, x VARCHAR, y VARCHAR) @@ -5553,6 +5574,16 @@ SELECT STRING_AGG(x,',') FROM strings WHERE g > 100 ---- NULL +query T +SELECT STRING_AGG(DISTINCT x,',') FROM strings WHERE g > 100 +---- +NULL + +query T +SELECT STRING_AGG(DISTINCT x,'|' ORDER BY x) FROM strings +---- +a|b|i|j|p|x|y|z + statement ok drop table strings @@ -5567,6 +5598,17 @@ FROM my_data ---- text1, text1, text1 +query T +WITH my_data as ( +SELECT 'text1'::varchar(1000) as my_column union all +SELECT 'text1'::varchar(1000) as my_column union all +SELECT 'text1'::varchar(1000) as my_column +) +SELECT string_agg(DISTINCT my_column,', ') as my_string_agg +FROM my_data +---- +text1 + query T WITH my_data as ( SELECT 1 as dummy, 'text1'::varchar(1000) as my_column union all @@ -5579,6 +5621,18 @@ GROUP BY dummy ---- text1, text1, text1 +query T +WITH my_data as ( +SELECT 1 as dummy, 'text1'::varchar(1000) as my_column union all +SELECT 1 as dummy, 'text1'::varchar(1000) as my_column union all +SELECT 1 as dummy, 'text1'::varchar(1000) as my_column +) +SELECT string_agg(DISTINCT my_column,', ') as my_string_agg +FROM my_data +GROUP BY dummy +---- +text1 + # Tests for aggregating with NaN values statement ok CREATE TABLE float_table ( diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 46e9c86c7591..8e470fe988d3 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -416,5 +416,58 @@ SELECT end FROM t; +statement ok +drop table t + +# Fix coercion of lists of structs +# https://github.com/apache/datafusion/issues/14154 + +statement ok +create or replace table t as values +( + 100, -- column1 int (so the 
case isn't constant folded) + [{ 'foo': arrow_cast('baz', 'Utf8View') }], -- column2 has List of Struct w/ Utf8View + [{ 'foo': 'bar' }], -- column3 has List of Struct w/ Utf8 + [{ 'foo': 'blarg' }] -- column4 has List of Struct w/ Utf8 +); + +# This case forces all branches to be coerced to the same type +query ? +SELECT + case + when column1 > 0 then column2 + when column1 < 0 then column3 + else column4 + end +FROM t; +---- +[{foo: baz}] + +# different orders of the branches +query ? +SELECT + case + when column1 > 0 then column3 -- NB different order + when column1 < 0 then column4 + else column2 + end +FROM t; +---- +[{foo: bar}] + +# different orders of the branches +query ? +SELECT + case + when column1 > 0 then column4 -- NB different order + when column1 < 0 then column2 + else column3 + end +FROM t; +---- +[{foo: blarg}] + + + statement ok drop table t diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 4653df400080..2ba6858d2710 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -707,6 +707,15 @@ datafusion public string_agg 1 OUT NULL LargeUtf8 NULL false 1 datafusion public string_agg 1 IN expression LargeUtf8 NULL false 2 datafusion public string_agg 2 IN delimiter Null NULL false 2 datafusion public string_agg 1 OUT NULL LargeUtf8 NULL false 2 +datafusion public string_agg 1 IN expression Utf8 NULL false 3 +datafusion public string_agg 2 IN delimiter Utf8 NULL false 3 +datafusion public string_agg 1 OUT NULL LargeUtf8 NULL false 3 +datafusion public string_agg 1 IN expression Utf8 NULL false 4 +datafusion public string_agg 2 IN delimiter LargeUtf8 NULL false 4 +datafusion public string_agg 1 OUT NULL LargeUtf8 NULL false 4 +datafusion public string_agg 1 IN expression Utf8 NULL false 5 +datafusion public string_agg 2 IN delimiter Null NULL false 5 +datafusion public string_agg 1 OUT NULL LargeUtf8 NULL false 5 # test variable length arguments query TTTBI rowsort diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index 800026dd766d..80f94e21d1fe 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -193,6 +193,29 @@ NULL [Köln] [إسرائيل] +# test string view +statement ok +CREATE TABLE t_stringview AS +SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(flags, 'Utf8View') as flags FROM t; + +query ? +SELECT regexp_match(str, pattern, flags) FROM t_stringview; +---- +[a] +[A] +[B] +NULL +NULL +NULL +[010] +[Düsseldorf] +[Москва] +[Köln] +[إسرائيل] + +statement ok +DROP TABLE t_stringview; + query ? 
SELECT regexp_match('foobarbequebaz', ''); ---- @@ -354,6 +377,29 @@ X X X +# test string view +statement ok +CREATE TABLE t_stringview AS +SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(flags, 'Utf8View') as flags FROM t; + +query T +SELECT regexp_replace(str, pattern, 'X', concat('g', flags)) FROM t_stringview; +---- +Xbc +X +aXc +AbC +aBC +4000 +X +X +X +X +X + +statement ok +DROP TABLE t_stringview; + query T SELECT regexp_replace('ABCabcABC', '(abc)', 'X', 'gi'); ---- @@ -621,7 +667,7 @@ CREATE TABLE t_stringview AS SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(start, 'Int64') as start, arrow_cast(flags, 'Utf8View') as flags FROM t; query I -SELECT regexp_count(str, '\w') from t; +SELECT regexp_count(str, '\w') from t_stringview; ---- 3 3 @@ -636,7 +682,7 @@ SELECT regexp_count(str, '\w') from t; 7 query I -SELECT regexp_count(str, '\w{2}', start) from t; +SELECT regexp_count(str, '\w{2}', start) from t_stringview; ---- 1 1 @@ -651,7 +697,7 @@ SELECT regexp_count(str, '\w{2}', start) from t; 3 query I -SELECT regexp_count(str, 'ab', 1, 'i') from t; +SELECT regexp_count(str, 'ab', 1, 'i') from t_stringview; ---- 1 1 @@ -667,7 +713,7 @@ SELECT regexp_count(str, 'ab', 1, 'i') from t; query I -SELECT regexp_count(str, pattern) from t; +SELECT regexp_count(str, pattern) from t_stringview; ---- 1 1 @@ -682,7 +728,7 @@ SELECT regexp_count(str, pattern) from t; 1 query I -SELECT regexp_count(str, pattern, start) from t; +SELECT regexp_count(str, pattern, start) from t_stringview; ---- 1 1 @@ -697,7 +743,7 @@ SELECT regexp_count(str, pattern, start) from t; 1 query I -SELECT regexp_count(str, pattern, start, flags) from t; +SELECT regexp_count(str, pattern, start, flags) from t_stringview; ---- 1 1 @@ -713,7 +759,7 @@ SELECT regexp_count(str, pattern, start, flags) from t; # test type coercion query I -SELECT regexp_count(arrow_cast(str, 'Utf8'), arrow_cast(pattern, 'LargeUtf8'), arrow_cast(start, 'Int32'), flags) from t; +SELECT regexp_count(arrow_cast(str, 'Utf8'), arrow_cast(pattern, 'LargeUtf8'), arrow_cast(start, 'Int32'), flags) from t_stringview; ---- 1 1 diff --git a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt index 01071f03dce6..2f12e9c7a39b 100644 --- a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt +++ b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt @@ -34,6 +34,15 @@ statement ok create table test_substr as select arrow_cast(col1, 'Dictionary(Int32, Utf8)') as c1 from test_substr_base; +statement ok +create table test_datetime as +select + arrow_cast(column1, 'Dictionary(Int32, Utf8)') as ts, + arrow_cast(column2, 'Dictionary(Int32, Utf8)') as d, + arrow_cast(column3, 'Dictionary(Int32, Utf8)') as t +from test_datetime_base; + + statement ok drop table test_source @@ -56,3 +65,6 @@ drop table test_basic_operator; statement ok drop table test_substr_base; + +statement ok +drop table test_datetime_base; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/string/init_data.slt.part b/datafusion/sqllogictest/test_files/string/init_data.slt.part index 06b65ff8e72a..7799dd605b90 100644 --- a/datafusion/sqllogictest/test_files/string/init_data.slt.part +++ b/datafusion/sqllogictest/test_files/string/init_data.slt.part @@ -37,3 +37,14 @@ statement ok create table test_substr_base ( col1 VARCHAR ) as values ('foo'), ('hello🌏世界'), ('💩'), 
('ThisIsAVeryLongASCIIString'), (''), (NULL); + + +# -------------------------------------- +# Setup test tables with date/time values to test coercion +# -------------------------------------- +statement ok +create table test_datetime_base as values + ('2024-08-09T12:13:14', '2024-08-09', '12:13:14'), + ('2024-08-09T12:13:15', '2024-09-09', '12:14:14'), + (NULL, NULL, NULL) +; diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt index 84f1e8382e53..93ec796ec6f0 100644 --- a/datafusion/sqllogictest/test_files/string/large_string.slt +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -34,6 +34,15 @@ statement ok create table test_substr as select arrow_cast(col1, 'LargeUtf8') as c1 from test_substr_base; +statement ok +create table test_datetime as +select + arrow_cast(column1, 'LargeUtf8') as ts, + arrow_cast(column2, 'LargeUtf8') as d, + arrow_cast(column3, 'LargeUtf8') as t +from test_datetime_base; + + # select query TTTT SELECT ascii_1, ascii_2, unicode_1, unicode_2 FROM test_basic_operator @@ -64,3 +73,6 @@ drop table test_basic_operator; statement ok drop table test_substr_base; + +statement ok +drop table test_datetime_base; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 55f0c034f5f9..d724e672e0fc 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,6 +34,13 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; +statement ok +create table test_datetime as +select + arrow_cast(column1, 'Utf8') as ts, + arrow_cast(column2, 'Utf8') as d, + arrow_cast(column3, 'Utf8') as t +from test_datetime_base; # @@ -186,3 +193,6 @@ drop table test_basic_operator; statement ok drop table test_substr; + +statement ok +drop table test_datetime; diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 2414e5864c99..a2806859b5ba 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -19,6 +19,10 @@ # with standard values, but different types in string columns # (String, StringView, etc.) 
+# -------------------------------------- +# Show the input data +# -------------------------------------- + # select query TTTT SELECT ascii_1, ascii_2, unicode_1, unicode_2 FROM test_basic_operator @@ -35,6 +39,49 @@ _ \_ (empty) (empty) NULL % NULL NULL NULL R NULL 🔥 +# -------------------------------------- +# test type coercion (compare to int) +# queries should not error +# -------------------------------------- + +query BB +select ascii_1 = 1 as col1, 1 = ascii_1 as col2 from test_basic_operator; +---- +false false +false false +false false +false false +false false +false false +false false +false false +false false +NULL NULL +NULL NULL + +query BB +select ascii_1 <> 1 as col1, 1 <> ascii_1 as col2 from test_basic_operator; +---- +true true +true true +true true +true true +true true +true true +true true +true true +true true +NULL NULL +NULL NULL + +# Coercion to date/time +query BBB +select ts = '2024-08-09T12:13:14'::timestamp, d = '2024-08-08'::date, t = '12:13:14'::time from test_datetime; +---- +true false true +false false false +NULL NULL NULL + # -------------------------------------- # column comparison as filters # -------------------------------------- diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 435b4bc3c5a8..c54e2aa7002c 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -34,6 +34,14 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base; +statement ok +create table test_datetime as +select + arrow_cast(column1, 'Utf8View') as ts, + arrow_cast(column2, 'Utf8View') as d, + arrow_cast(column3, 'Utf8View') as t +from test_datetime_base; + statement ok drop table test_source @@ -51,6 +59,9 @@ drop table test_basic_operator; statement ok drop table test_substr_base; +statement ok +drop table test_datetime_base; + # -------------------------------------- # String_view specific tests @@ -783,7 +794,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: regexp_match(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +01)Projection: regexp_match(test.column1_utf8view, Utf8View("^https?://(?:www\.)?([^/]+)/.*$")) AS k 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REGEXP_REPLACE diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index d671798b7d0f..0afe39de1795 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -459,14 +459,14 @@ create table t as values({r: 'a', c: 1}), ({r: 'b', c: 2.3}); query ? 
select * from t; ---- -{c0: a, c1: 1.0} -{c0: b, c1: 2.3} +{r: a, c: 1.0} +{r: b, c: 2.3} query T select arrow_typeof(column1) from t; ---- -Struct([Field { name: "c0", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) -Struct([Field { name: "c0", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) +Struct([Field { name: "r", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) +Struct([Field { name: "r", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) statement ok drop table t; diff --git a/dev/changelog/45.0.0.md b/dev/changelog/45.0.0.md index 2303eee92a1d..ca905c0a1a56 100644 --- a/dev/changelog/45.0.0.md +++ b/dev/changelog/45.0.0.md @@ -19,7 +19,7 @@ under the License. # Apache DataFusion 45.0.0 Changelog -This release consists of 252 commits from 83 contributors. See credits at the end of this changelog for more information. +This release consists of 258 commits from 83 contributors. See credits at the end of this changelog for more information. **Breaking changes:** @@ -94,6 +94,7 @@ This release consists of 252 commits from 83 contributors. See credits at the en - Support arrays_overlap function (alias of `array_has_any`) [#14217](https://github.com/apache/datafusion/pull/14217) (erenavsarogullari) - chore: Adding commit activity badge [#14386](https://github.com/apache/datafusion/pull/14386) (comphead) - docs: Clarify join behavior in `DataFrame::join` [#14393](https://github.com/apache/datafusion/pull/14393) (rkrishn7) +- Prepare for `45.0.0` release: Version and Changelog [#14397](https://github.com/apache/datafusion/pull/14397) (alamb) **Other:** @@ -290,13 +291,18 @@ This release consists of 252 commits from 83 contributors. See credits at the en - FFI support for versions and alternate tokio runtimes [#13937](https://github.com/apache/datafusion/pull/13937) (timsaucer) - Do not rename struct fields when coercing types in `CASE` [#14384](https://github.com/apache/datafusion/pull/14384) (alamb) - Add `TableProvider::insert_into` into FFI Bindings [#14391](https://github.com/apache/datafusion/pull/14391) (davisp) +- [branch-45]: Backport chore: Upgrade to `arrow`/`parquet` `54.1.0` and fix clippy/ci (#14415) [#14453](https://github.com/apache/datafusion/pull/14453) (alamb) +- [release-45] Fix join type coercion (#14387) [#14454](https://github.com/apache/datafusion/pull/14454) (alamb) +- [branch-45] Support `Utf8View` to `numeric` coercion (#14377) [#14455](https://github.com/apache/datafusion/pull/14455) (alamb) +- [branch-45] Update REGEXP_MATCH scalar function to support Utf8View (#14449) [#14457](https://github.com/apache/datafusion/pull/14457) (alamb) +- [branch-45] Fix regression list Type Coercion List with inner type struct which has large/view types (#14385) [#14456](https://github.com/apache/datafusion/pull/14456) (alamb) ## Credits Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. 
``` - 46 Andrew Lamb + 52 Andrew Lamb 22 Ian Lai 20 dependabot[bot] 8 Bruce Ritchie
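A hedged usage sketch of the `REGEXP_MATCH` Utf8View path added in the `regexpmatch.rs` hunks earlier in this diff (listed above as the branch-45 backport of the Utf8View support): with `StringViewArray` inputs the helper no longer requires a cast to `Utf8`. Only the `regexp_match(&[ArrayRef]) -> Result<ArrayRef>` signature is taken from the diff itself; the `datafusion_functions::regex::regexpmatch::regexp_match` import path is an assumption.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, StringViewArray};
use datafusion_common::Result;
// Assumed import path for the helper shown in this diff:
use datafusion_functions::regex::regexpmatch::regexp_match;

fn main() -> Result<()> {
    // Utf8View inputs are now handled natively (previously they were coerced to Utf8).
    let values: ArrayRef =
        Arc::new(StringViewArray::from_iter_values(["abc-005-def", "abc-xyz"]));
    let patterns: ArrayRef =
        Arc::new(StringViewArray::from_iter_values([r"(\d+)-(\d+)", r"(\d+)-(\d+)"]));

    // Returns a ListArray with the capture groups for each row
    // (NULL where the pattern does not match).
    let matches = regexp_match(&[values, patterns])?;
    println!("{matches:?}");
    Ok(())
}
```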