Skip to content

Commit e35630a

Browse files
committed
Update ExtensionType trait to support more canonical extension types
1 parent 374d017 commit e35630a

File tree

14 files changed

+1467
-333
lines changed

14 files changed

+1467
-333
lines changed

arrow-schema/Cargo.toml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,19 @@ path = "src/lib.rs"
3434
bench = false
3535

3636
[dependencies]
37-
serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true }
37+
serde = { version = "1.0", default-features = false, features = [
38+
"derive",
39+
"std",
40+
"rc",
41+
], optional = true }
3842
bitflags = { version = "2.0.0", default-features = false, optional = true }
39-
serde_json = "1.0"
43+
serde_json = { version = "1.0", optional = true }
4044

4145
[features]
46+
canonical-extension-types = ["dep:serde", "dep:serde_json"]
4247
# Enable ffi support
4348
ffi = ["bitflags"]
49+
serde = ["dep:serde"]
4450

4551
[package.metadata.docs.rs]
4652
features = ["ffi"]

arrow-schema/src/datatype.rs

Lines changed: 0 additions & 293 deletions
Original file line numberDiff line numberDiff line change
@@ -764,299 +764,6 @@ impl DataType {
764764
}
765765
}
766766

767-
/// The metadata key for the string name identifying the custom data type.
768-
pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
769-
770-
/// The metadata key for a serialized representation of the ExtensionType
771-
/// necessary to reconstruct the custom type.
772-
pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
773-
774-
/// Extension types.
775-
///
776-
/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
777-
pub trait ExtensionType: Sized {
778-
/// The name of this extension type.
779-
const NAME: &'static str;
780-
781-
/// The supported storage types of this extension type.
782-
fn storage_types(&self) -> &[DataType];
783-
784-
/// The metadata type of this extension type.
785-
type Metadata;
786-
787-
/// Returns a reference to the metadata of this extension type, or `None`
788-
/// if this extension type has no metadata.
789-
fn metadata(&self) -> Option<&Self::Metadata>;
790-
791-
/// Returns the serialized representation of the metadata of this extension
792-
/// type, or `None` if this extension type has no metadata.
793-
fn serialized_metadata(&self) -> Option<String>;
794-
795-
/// Deserialize this extension type from the serialized representation of the
796-
/// metadata of this extension. An extension type that has no metadata should
797-
/// expect `None` for the serialized metadata.
798-
fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self>;
799-
}
800-
801-
pub(crate) trait ExtensionTypeExt: ExtensionType {
802-
/// Returns `true` if the given data type is supported by this extension
803-
/// type.
804-
fn supports(&self, data_type: &DataType) -> bool {
805-
self.storage_types().contains(data_type)
806-
}
807-
808-
/// Try to extract this extension type from the given [`Field`].
809-
///
810-
/// This function returns `None` if extension type
811-
/// - information is missing
812-
/// - name does not match
813-
/// - metadata deserialization failed
814-
/// - does not support the data type of this field
815-
fn try_from_field(field: &Field) -> Option<Self> {
816-
field
817-
.metadata()
818-
.get(EXTENSION_TYPE_NAME_KEY)
819-
.and_then(|name| {
820-
(name == <Self as ExtensionType>::NAME)
821-
.then(|| {
822-
Self::from_serialized_metadata(
823-
field
824-
.metadata()
825-
.get(EXTENSION_TYPE_METADATA_KEY)
826-
.map(String::as_str),
827-
)
828-
})
829-
.flatten()
830-
})
831-
.filter(|extension_type| extension_type.supports(field.data_type()))
832-
}
833-
}
834-
835-
impl<T> ExtensionTypeExt for T where T: ExtensionType {}
836-
837-
/// Canonical extension types.
838-
///
839-
/// The Arrow columnar format allows defining extension types so as to extend
840-
/// standard Arrow data types with custom semantics. Often these semantics will
841-
/// be specific to a system or application. However, it is beneficial to share
842-
/// the definitions of well-known extension types so as to improve
843-
/// interoperability between different systems integrating Arrow columnar data.
844-
pub mod canonical_extension_types {
845-
use serde_json::Value;
846-
847-
use super::{DataType, ExtensionType};
848-
849-
/// Canonical extension types.
850-
#[non_exhaustive]
851-
#[derive(Debug, Clone, PartialEq)]
852-
pub enum CanonicalExtensionTypes {
853-
/// The extension type for `JSON`.
854-
Json(Json),
855-
/// The extension type for `UUID`.
856-
Uuid(Uuid),
857-
}
858-
859-
impl From<Json> for CanonicalExtensionTypes {
860-
fn from(value: Json) -> Self {
861-
CanonicalExtensionTypes::Json(value)
862-
}
863-
}
864-
865-
impl From<Uuid> for CanonicalExtensionTypes {
866-
fn from(value: Uuid) -> Self {
867-
CanonicalExtensionTypes::Uuid(value)
868-
}
869-
}
870-
871-
/// The extension type for `JSON`.
872-
///
873-
/// Extension name: `arrow.json`.
874-
///
875-
/// The storage type of this extension is `String` or `LargeString` or
876-
/// `StringView`. Only UTF-8 encoded JSON as specified in [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259)
877-
/// is supported.
878-
///
879-
/// This type does not have any parameters.
880-
///
881-
/// Metadata is either an empty string or a JSON string with an empty
882-
/// object. In the future, additional fields may be added, but they are not
883-
/// required to interpret the array.
884-
///
885-
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>
886-
#[derive(Debug, Clone, PartialEq)]
887-
pub struct Json(Value);
888-
889-
impl Default for Json {
890-
fn default() -> Self {
891-
Self(Value::String("".to_owned()))
892-
}
893-
}
894-
895-
impl ExtensionType for Json {
896-
const NAME: &'static str = "arrow.json";
897-
898-
type Metadata = Value;
899-
900-
fn storage_types(&self) -> &[DataType] {
901-
&[DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View]
902-
}
903-
904-
fn metadata(&self) -> Option<&Self::Metadata> {
905-
Some(&self.0)
906-
}
907-
908-
fn serialized_metadata(&self) -> Option<String> {
909-
Some(self.0.to_string())
910-
}
911-
912-
fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self> {
913-
serialized_metadata.and_then(|metadata| match metadata {
914-
// Empty string
915-
r#""""# => Some(Default::default()),
916-
// Empty object
917-
value => value
918-
.parse::<Value>()
919-
.ok()
920-
.filter(|value| matches!(value.as_object(), Some(map) if map.is_empty()))
921-
.map(Self),
922-
})
923-
}
924-
}
925-
926-
/// The extension type for `UUID`.
927-
///
928-
/// Extension name: `arrow.uuid`.
929-
///
930-
/// The storage type of the extension is `FixedSizeBinary` with a length of
931-
/// 16 bytes.
932-
///
933-
/// Note:
934-
/// A specific UUID version is not required or guaranteed. This extension
935-
/// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and
936-
/// does not interpret the bytes in any way.
937-
///
938-
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid>
939-
#[derive(Debug, Default, Clone, Copy, PartialEq)]
940-
pub struct Uuid;
941-
942-
impl ExtensionType for Uuid {
943-
const NAME: &'static str = "arrow.uuid";
944-
945-
type Metadata = ();
946-
947-
fn storage_types(&self) -> &[DataType] {
948-
&[DataType::FixedSizeBinary(16)]
949-
}
950-
951-
fn metadata(&self) -> Option<&Self::Metadata> {
952-
None
953-
}
954-
955-
fn serialized_metadata(&self) -> Option<String> {
956-
None
957-
}
958-
959-
fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self> {
960-
serialized_metadata.is_none().then_some(Self)
961-
}
962-
}
963-
964-
#[cfg(test)]
965-
mod tests {
966-
use std::collections::HashMap;
967-
968-
use serde_json::Map;
969-
970-
use crate::{ArrowError, Field, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
971-
972-
use super::*;
973-
974-
#[test]
975-
fn json() -> Result<(), ArrowError> {
976-
let mut field = Field::new("", DataType::Utf8, false);
977-
field.try_with_extension_type(Json::default())?;
978-
assert_eq!(
979-
field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
980-
Some(&r#""""#.to_owned())
981-
);
982-
assert!(field.extension_type::<Json>().is_some());
983-
984-
let mut field = Field::new("", DataType::LargeUtf8, false);
985-
field.try_with_extension_type(Json(serde_json::Value::Object(Map::default())))?;
986-
assert_eq!(
987-
field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
988-
Some(&"{}".to_owned())
989-
);
990-
assert!(field.extension_type::<Json>().is_some());
991-
992-
let mut field = Field::new("", DataType::Utf8View, false);
993-
field.try_with_extension_type(Json::default())?;
994-
assert!(field.extension_type::<Json>().is_some());
995-
assert_eq!(
996-
field.canonical_extension_type(),
997-
Some(CanonicalExtensionTypes::Json(Json::default()))
998-
);
999-
Ok(())
1000-
}
1001-
1002-
#[test]
1003-
#[should_panic(expected = "expected Utf8 or LargeUtf8 or Utf8View, found Boolean")]
1004-
fn json_bad_type() {
1005-
Field::new("", DataType::Boolean, false).with_extension_type(Json::default());
1006-
}
1007-
1008-
#[test]
1009-
fn json_bad_metadata() {
1010-
let field = Field::new("", DataType::Utf8, false).with_metadata(HashMap::from_iter([
1011-
(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()),
1012-
(EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()),
1013-
]));
1014-
// This returns `None` now because this metadata is invalid.
1015-
assert!(field.extension_type::<Json>().is_none());
1016-
}
1017-
1018-
#[test]
1019-
fn json_missing_metadata() {
1020-
let field = Field::new("", DataType::LargeUtf8, false).with_metadata(
1021-
HashMap::from_iter([(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]),
1022-
);
1023-
// This returns `None` now because the metadata is missing.
1024-
assert!(field.extension_type::<Json>().is_none());
1025-
}
1026-
1027-
#[test]
1028-
fn uuid() -> Result<(), ArrowError> {
1029-
let mut field = Field::new("", DataType::FixedSizeBinary(16), false);
1030-
field.try_with_extension_type(Uuid)?;
1031-
assert!(field.extension_type::<Uuid>().is_some());
1032-
assert_eq!(
1033-
field.canonical_extension_type(),
1034-
Some(CanonicalExtensionTypes::Uuid(Uuid))
1035-
);
1036-
Ok(())
1037-
}
1038-
1039-
#[test]
1040-
#[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")]
1041-
fn uuid_bad_type() {
1042-
Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid);
1043-
}
1044-
1045-
#[test]
1046-
fn uuid_with_metadata() {
1047-
// Add metadata that's not expected for uuid.
1048-
let field = Field::new("", DataType::FixedSizeBinary(16), false)
1049-
.with_metadata(HashMap::from_iter([(
1050-
EXTENSION_TYPE_METADATA_KEY.to_owned(),
1051-
"".to_owned(),
1052-
)]))
1053-
.with_extension_type(Uuid);
1054-
// This returns `None` now because `Uuid` expects no metadata.
1055-
assert!(field.extension_type::<Uuid>().is_none());
1056-
}
1057-
}
1058-
}
1059-
1060767
/// The maximum precision for [DataType::Decimal128] values
1061768
pub const DECIMAL128_MAX_PRECISION: u8 = 38;
1062769

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! 8-bit Boolean
19+
//!
20+
//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
21+
22+
use crate::{extension::ExtensionType, ArrowError, DataType};
23+
24+
/// The extension type for `8-bit Boolean`.
///
/// Extension name: `arrow.bool8`.
///
/// The storage type of the extension is `Int8` where:
/// - false is denoted by the value 0.
/// - true can be specified using any non-zero value. Preferably 1.
///
/// This is a unit struct: it carries no parameters of its own, so all values
/// compare equal. `Eq` and `Hash` are derived alongside `PartialEq` so the
/// type can be used as a key in hashed collections.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Bool8;
35+
36+
impl ExtensionType for Bool8 {
37+
const NAME: &str = "arrow.bool8";
38+
39+
type Metadata = &'static str;
40+
41+
fn metadata(&self) -> &Self::Metadata {
42+
&""
43+
}
44+
45+
fn serialize_metadata(&self) -> Option<String> {
46+
Some(String::default())
47+
}
48+
49+
fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
50+
const ERR: &str = "Bool8 extension type expects an empty string as metadata";
51+
metadata.map_or_else(
52+
|| Err(ArrowError::InvalidArgumentError(ERR.to_owned())),
53+
|value| match value {
54+
"" => Ok(""),
55+
_ => Err(ArrowError::InvalidArgumentError(ERR.to_owned())),
56+
},
57+
)
58+
}
59+
60+
fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
61+
match data_type {
62+
DataType::Int8 => Ok(()),
63+
data_type => Err(ArrowError::InvalidArgumentError(format!(
64+
"Bool8 data type mismatch, expected Int8, found {data_type}"
65+
))),
66+
}
67+
}
68+
69+
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
70+
Self.supports_data_type(data_type).map(|_| Self)
71+
}
72+
}

0 commit comments

Comments
 (0)