Skip to content

Commit f373a86

Browse files
XiangpengHao and alamb authored
Add initial support for Utf8View and BinaryView types (#10925)
* add view types
* Add slt tests
* comment out failing test
* update vendored code

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 1cb0057 commit f373a86

File tree

10 files changed

+193
-19
lines changed

10 files changed

+193
-19
lines changed

datafusion/common/src/scalar/mod.rs

Lines changed: 78 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -221,10 +221,14 @@ pub enum ScalarValue {
221221
UInt64(Option<u64>),
222222
/// utf-8 encoded string.
223223
Utf8(Option<String>),
224+
/// utf-8 encoded string whose arrow type is `Utf8View` (the string view type).
225+
Utf8View(Option<String>),
224226
/// utf-8 encoded string representing a LargeString's arrow type.
225227
LargeUtf8(Option<String>),
226228
/// binary
227229
Binary(Option<Vec<u8>>),
230+
/// binary whose arrow type is `BinaryView` (the binary view type).
231+
BinaryView(Option<Vec<u8>>),
228232
/// fixed size binary
229233
FixedSizeBinary(i32, Option<Vec<u8>>),
230234
/// large binary
@@ -345,10 +349,14 @@ impl PartialEq for ScalarValue {
345349
(UInt64(_), _) => false,
346350
(Utf8(v1), Utf8(v2)) => v1.eq(v2),
347351
(Utf8(_), _) => false,
352+
(Utf8View(v1), Utf8View(v2)) => v1.eq(v2),
353+
(Utf8View(_), _) => false,
348354
(LargeUtf8(v1), LargeUtf8(v2)) => v1.eq(v2),
349355
(LargeUtf8(_), _) => false,
350356
(Binary(v1), Binary(v2)) => v1.eq(v2),
351357
(Binary(_), _) => false,
358+
(BinaryView(v1), BinaryView(v2)) => v1.eq(v2),
359+
(BinaryView(_), _) => false,
352360
(FixedSizeBinary(_, v1), FixedSizeBinary(_, v2)) => v1.eq(v2),
353361
(FixedSizeBinary(_, _), _) => false,
354362
(LargeBinary(v1), LargeBinary(v2)) => v1.eq(v2),
@@ -470,8 +478,12 @@ impl PartialOrd for ScalarValue {
470478
(Utf8(_), _) => None,
471479
(LargeUtf8(v1), LargeUtf8(v2)) => v1.partial_cmp(v2),
472480
(LargeUtf8(_), _) => None,
481+
(Utf8View(v1), Utf8View(v2)) => v1.partial_cmp(v2),
482+
(Utf8View(_), _) => None,
473483
(Binary(v1), Binary(v2)) => v1.partial_cmp(v2),
474484
(Binary(_), _) => None,
485+
(BinaryView(v1), BinaryView(v2)) => v1.partial_cmp(v2),
486+
(BinaryView(_), _) => None,
475487
(FixedSizeBinary(_, v1), FixedSizeBinary(_, v2)) => v1.partial_cmp(v2),
476488
(FixedSizeBinary(_, _), _) => None,
477489
(LargeBinary(v1), LargeBinary(v2)) => v1.partial_cmp(v2),
@@ -667,11 +679,10 @@ impl std::hash::Hash for ScalarValue {
667679
UInt16(v) => v.hash(state),
668680
UInt32(v) => v.hash(state),
669681
UInt64(v) => v.hash(state),
670-
Utf8(v) => v.hash(state),
671-
LargeUtf8(v) => v.hash(state),
672-
Binary(v) => v.hash(state),
673-
FixedSizeBinary(_, v) => v.hash(state),
674-
LargeBinary(v) => v.hash(state),
682+
Utf8(v) | LargeUtf8(v) | Utf8View(v) => v.hash(state),
683+
Binary(v) | FixedSizeBinary(_, v) | LargeBinary(v) | BinaryView(v) => {
684+
v.hash(state)
685+
}
675686
List(arr) => {
676687
hash_nested_array(arr.to_owned() as ArrayRef, state);
677688
}
@@ -1107,7 +1118,9 @@ impl ScalarValue {
11071118
ScalarValue::Float64(_) => DataType::Float64,
11081119
ScalarValue::Utf8(_) => DataType::Utf8,
11091120
ScalarValue::LargeUtf8(_) => DataType::LargeUtf8,
1121+
ScalarValue::Utf8View(_) => DataType::Utf8View,
11101122
ScalarValue::Binary(_) => DataType::Binary,
1123+
ScalarValue::BinaryView(_) => DataType::BinaryView,
11111124
ScalarValue::FixedSizeBinary(sz, _) => DataType::FixedSizeBinary(*sz),
11121125
ScalarValue::LargeBinary(_) => DataType::LargeBinary,
11131126
ScalarValue::List(arr) => arr.data_type().to_owned(),
@@ -1310,11 +1323,13 @@ impl ScalarValue {
13101323
ScalarValue::UInt16(v) => v.is_none(),
13111324
ScalarValue::UInt32(v) => v.is_none(),
13121325
ScalarValue::UInt64(v) => v.is_none(),
1313-
ScalarValue::Utf8(v) => v.is_none(),
1314-
ScalarValue::LargeUtf8(v) => v.is_none(),
1315-
ScalarValue::Binary(v) => v.is_none(),
1316-
ScalarValue::FixedSizeBinary(_, v) => v.is_none(),
1317-
ScalarValue::LargeBinary(v) => v.is_none(),
1326+
ScalarValue::Utf8(v)
1327+
| ScalarValue::Utf8View(v)
1328+
| ScalarValue::LargeUtf8(v) => v.is_none(),
1329+
ScalarValue::Binary(v)
1330+
| ScalarValue::BinaryView(v)
1331+
| ScalarValue::FixedSizeBinary(_, v)
1332+
| ScalarValue::LargeBinary(v) => v.is_none(),
13181333
// arr.len() should be 1 for a list scalar, but we don't seem to
13191334
// enforce that anywhere, so we still check against array length.
13201335
ScalarValue::List(arr) => arr.len() == arr.null_count(),
@@ -2002,6 +2017,12 @@ impl ScalarValue {
20022017
}
20032018
None => new_null_array(&DataType::Utf8, size),
20042019
},
2020+
ScalarValue::Utf8View(e) => match e {
2021+
Some(value) => {
2022+
Arc::new(StringViewArray::from_iter_values(repeat(value).take(size)))
2023+
}
2024+
None => new_null_array(&DataType::Utf8View, size),
2025+
},
20052026
ScalarValue::LargeUtf8(e) => match e {
20062027
Some(value) => {
20072028
Arc::new(LargeStringArray::from_iter_values(repeat(value).take(size)))
@@ -2018,6 +2039,16 @@ impl ScalarValue {
20182039
Arc::new(repeat(None::<&str>).take(size).collect::<BinaryArray>())
20192040
}
20202041
},
2042+
ScalarValue::BinaryView(e) => match e {
2043+
Some(value) => Arc::new(
2044+
repeat(Some(value.as_slice()))
2045+
.take(size)
2046+
.collect::<BinaryViewArray>(),
2047+
),
2048+
None => {
2049+
Arc::new(repeat(None::<&str>).take(size).collect::<BinaryViewArray>())
2050+
}
2051+
},
20212052
ScalarValue::FixedSizeBinary(s, e) => match e {
20222053
Some(value) => Arc::new(
20232054
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
@@ -2361,10 +2392,14 @@ impl ScalarValue {
23612392
DataType::LargeBinary => {
23622393
typed_cast!(array, index, LargeBinaryArray, LargeBinary)?
23632394
}
2395+
DataType::BinaryView => {
2396+
typed_cast!(array, index, BinaryViewArray, BinaryView)?
2397+
}
23642398
DataType::Utf8 => typed_cast!(array, index, StringArray, Utf8)?,
23652399
DataType::LargeUtf8 => {
23662400
typed_cast!(array, index, LargeStringArray, LargeUtf8)?
23672401
}
2402+
DataType::Utf8View => typed_cast!(array, index, StringViewArray, Utf8View)?,
23682403
DataType::List(_) => {
23692404
let list_array = array.as_list::<i32>();
23702405
let nested_array = list_array.value(index);
@@ -2652,12 +2687,18 @@ impl ScalarValue {
26522687
ScalarValue::Utf8(val) => {
26532688
eq_array_primitive!(array, index, StringArray, val)?
26542689
}
2690+
ScalarValue::Utf8View(val) => {
2691+
eq_array_primitive!(array, index, StringViewArray, val)?
2692+
}
26552693
ScalarValue::LargeUtf8(val) => {
26562694
eq_array_primitive!(array, index, LargeStringArray, val)?
26572695
}
26582696
ScalarValue::Binary(val) => {
26592697
eq_array_primitive!(array, index, BinaryArray, val)?
26602698
}
2699+
ScalarValue::BinaryView(val) => {
2700+
eq_array_primitive!(array, index, BinaryViewArray, val)?
2701+
}
26612702
ScalarValue::FixedSizeBinary(_, val) => {
26622703
eq_array_primitive!(array, index, FixedSizeBinaryArray, val)?
26632704
}
@@ -2790,7 +2831,9 @@ impl ScalarValue {
27902831
| ScalarValue::DurationMillisecond(_)
27912832
| ScalarValue::DurationMicrosecond(_)
27922833
| ScalarValue::DurationNanosecond(_) => 0,
2793-
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => {
2834+
ScalarValue::Utf8(s)
2835+
| ScalarValue::LargeUtf8(s)
2836+
| ScalarValue::Utf8View(s) => {
27942837
s.as_ref().map(|s| s.capacity()).unwrap_or_default()
27952838
}
27962839
ScalarValue::TimestampSecond(_, s)
@@ -2801,7 +2844,8 @@ impl ScalarValue {
28012844
}
28022845
ScalarValue::Binary(b)
28032846
| ScalarValue::FixedSizeBinary(_, b)
2804-
| ScalarValue::LargeBinary(b) => {
2847+
| ScalarValue::LargeBinary(b)
2848+
| ScalarValue::BinaryView(b) => {
28052849
b.as_ref().map(|b| b.capacity()).unwrap_or_default()
28062850
}
28072851
ScalarValue::List(arr) => arr.get_array_memory_size(),
@@ -3068,7 +3112,9 @@ impl TryFrom<&DataType> for ScalarValue {
30683112
}
30693113
DataType::Utf8 => ScalarValue::Utf8(None),
30703114
DataType::LargeUtf8 => ScalarValue::LargeUtf8(None),
3115+
DataType::Utf8View => ScalarValue::Utf8View(None),
30713116
DataType::Binary => ScalarValue::Binary(None),
3117+
DataType::BinaryView => ScalarValue::BinaryView(None),
30723118
DataType::FixedSizeBinary(len) => ScalarValue::FixedSizeBinary(*len, None),
30733119
DataType::LargeBinary => ScalarValue::LargeBinary(None),
30743120
DataType::Date32 => ScalarValue::Date32(None),
@@ -3190,11 +3236,13 @@ impl fmt::Display for ScalarValue {
31903236
ScalarValue::TimestampMillisecond(e, _) => format_option!(f, e)?,
31913237
ScalarValue::TimestampMicrosecond(e, _) => format_option!(f, e)?,
31923238
ScalarValue::TimestampNanosecond(e, _) => format_option!(f, e)?,
3193-
ScalarValue::Utf8(e) => format_option!(f, e)?,
3194-
ScalarValue::LargeUtf8(e) => format_option!(f, e)?,
3239+
ScalarValue::Utf8(e)
3240+
| ScalarValue::LargeUtf8(e)
3241+
| ScalarValue::Utf8View(e) => format_option!(f, e)?,
31953242
ScalarValue::Binary(e)
31963243
| ScalarValue::FixedSizeBinary(_, e)
3197-
| ScalarValue::LargeBinary(e) => match e {
3244+
| ScalarValue::LargeBinary(e)
3245+
| ScalarValue::BinaryView(e) => match e {
31983246
Some(l) => write!(
31993247
f,
32003248
"{}",
@@ -3318,10 +3366,14 @@ impl fmt::Debug for ScalarValue {
33183366
}
33193367
ScalarValue::Utf8(None) => write!(f, "Utf8({self})"),
33203368
ScalarValue::Utf8(Some(_)) => write!(f, "Utf8(\"{self}\")"),
3369+
ScalarValue::Utf8View(None) => write!(f, "Utf8View({self})"),
3370+
ScalarValue::Utf8View(Some(_)) => write!(f, "Utf8View(\"{self}\")"),
33213371
ScalarValue::LargeUtf8(None) => write!(f, "LargeUtf8({self})"),
33223372
ScalarValue::LargeUtf8(Some(_)) => write!(f, "LargeUtf8(\"{self}\")"),
33233373
ScalarValue::Binary(None) => write!(f, "Binary({self})"),
33243374
ScalarValue::Binary(Some(_)) => write!(f, "Binary(\"{self}\")"),
3375+
ScalarValue::BinaryView(None) => write!(f, "BinaryView({self})"),
3376+
ScalarValue::BinaryView(Some(_)) => write!(f, "BinaryView(\"{self}\")"),
33253377
ScalarValue::FixedSizeBinary(size, None) => {
33263378
write!(f, "FixedSizeBinary({size}, {self})")
33273379
}
@@ -5393,6 +5445,17 @@ mod tests {
53935445
ScalarValue::Utf8(None),
53945446
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
53955447
);
5448+
5449+
// needs https://github.com/apache/arrow-rs/issues/5893
5450+
/*
5451+
check_scalar_cast(ScalarValue::Utf8(None), DataType::Utf8View);
5452+
check_scalar_cast(ScalarValue::from("foo"), DataType::Utf8View);
5453+
check_scalar_cast(
5454+
ScalarValue::from("larger than 12 bytes string"),
5455+
DataType::Utf8View,
5456+
);
5457+
5458+
*/
53965459
}
53975460

53985461
// mimics how casting works on scalar values by `casting` `scalar` to `desired_type`

datafusion/functions/src/core/arrow_cast.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -564,7 +564,9 @@ impl<'a> Tokenizer<'a> {
564564

565565
"Utf8" => Token::SimpleType(DataType::Utf8),
566566
"LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
567+
"Utf8View" => Token::SimpleType(DataType::Utf8View),
567568
"Binary" => Token::SimpleType(DataType::Binary),
569+
"BinaryView" => Token::SimpleType(DataType::BinaryView),
568570
"LargeBinary" => Token::SimpleType(DataType::LargeBinary),
569571

570572
"Float16" => Token::SimpleType(DataType::Float16),
@@ -772,11 +774,13 @@ mod test {
772774
DataType::Interval(IntervalUnit::DayTime),
773775
DataType::Interval(IntervalUnit::MonthDayNano),
774776
DataType::Binary,
777+
DataType::BinaryView,
775778
DataType::FixedSizeBinary(0),
776779
DataType::FixedSizeBinary(1234),
777780
DataType::FixedSizeBinary(-432),
778781
DataType::LargeBinary,
779782
DataType::Utf8,
783+
DataType::Utf8View,
780784
DataType::LargeUtf8,
781785
DataType::Decimal128(7, 12),
782786
DataType::Decimal256(6, 13),

datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -248,6 +248,7 @@ message ScalarValue{
248248
bool bool_value = 1;
249249
string utf8_value = 2;
250250
string large_utf8_value = 3;
251+
string utf8_view_value = 23;
251252
int32 int8_value = 4;
252253
int32 int16_value = 5;
253254
int32 int32_value = 6;
@@ -281,6 +282,7 @@ message ScalarValue{
281282
ScalarDictionaryValue dictionary_value = 27;
282283
bytes binary_value = 28;
283284
bytes large_binary_value = 29;
285+
bytes binary_view_value = 22;
284286
ScalarTime64Value time64_value = 30;
285287
IntervalDayTimeValue interval_daytime_value = 25;
286288
IntervalMonthDayNanoValue interval_month_day_nano = 31;
@@ -318,8 +320,10 @@ message ArrowType{
318320
EmptyMessage FLOAT32 = 12 ;
319321
EmptyMessage FLOAT64 = 13 ;
320322
EmptyMessage UTF8 = 14 ;
323+
EmptyMessage UTF8_VIEW = 35;
321324
EmptyMessage LARGE_UTF8 = 32;
322325
EmptyMessage BINARY = 15 ;
326+
EmptyMessage BINARY_VIEW = 34;
323327
int32 FIXED_SIZE_BINARY = 16 ;
324328
EmptyMessage LARGE_BINARY = 31;
325329
EmptyMessage DATE32 = 17 ;

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -224,8 +224,10 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType {
224224
arrow_type::ArrowTypeEnum::Float32(_) => DataType::Float32,
225225
arrow_type::ArrowTypeEnum::Float64(_) => DataType::Float64,
226226
arrow_type::ArrowTypeEnum::Utf8(_) => DataType::Utf8,
227+
arrow_type::ArrowTypeEnum::Utf8View(_) => DataType::Utf8View,
227228
arrow_type::ArrowTypeEnum::LargeUtf8(_) => DataType::LargeUtf8,
228229
arrow_type::ArrowTypeEnum::Binary(_) => DataType::Binary,
230+
arrow_type::ArrowTypeEnum::BinaryView(_) => DataType::BinaryView,
229231
arrow_type::ArrowTypeEnum::FixedSizeBinary(size) => {
230232
DataType::FixedSizeBinary(*size)
231233
}
@@ -361,6 +363,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
361363
Ok(match value {
362364
Value::BoolValue(v) => Self::Boolean(Some(*v)),
363365
Value::Utf8Value(v) => Self::Utf8(Some(v.to_owned())),
366+
Value::Utf8ViewValue(v) => Self::Utf8View(Some(v.to_owned())),
364367
Value::LargeUtf8Value(v) => Self::LargeUtf8(Some(v.to_owned())),
365368
Value::Int8Value(v) => Self::Int8(Some(*v as i8)),
366369
Value::Int16Value(v) => Self::Int16(Some(*v as i16)),
@@ -571,6 +574,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
571574
Self::Dictionary(Box::new(index_type), Box::new(value))
572575
}
573576
Value::BinaryValue(v) => Self::Binary(Some(v.clone())),
577+
Value::BinaryViewValue(v) => Self::BinaryView(Some(v.clone())),
574578
Value::LargeBinaryValue(v) => Self::LargeBinary(Some(v.clone())),
575579
Value::IntervalDaytimeValue(v) => Self::IntervalDayTime(Some(
576580
IntervalDayTimeType::make_value(v.days, v.milliseconds),

0 commit comments

Comments
 (0)