Skip to content

Commit 93ce75c

Browse files
authored
Correctly handling nullable in CSV parser (#6830)
1 parent fa6d5e1 commit 93ce75c

File tree

2 files changed

+69
-8
lines changed

2 files changed

+69
-8
lines changed

arrow-csv/src/reader/mod.rs

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -779,42 +779,66 @@ fn parse(
779779
match key_type.as_ref() {
780780
DataType::Int8 => Ok(Arc::new(
781781
rows.iter()
782-
.map(|row| row.get(i))
782+
.map(|row| {
783+
let s = row.get(i);
784+
(!null_regex.is_null(s)).then_some(s)
785+
})
783786
.collect::<DictionaryArray<Int8Type>>(),
784787
) as ArrayRef),
785788
DataType::Int16 => Ok(Arc::new(
786789
rows.iter()
787-
.map(|row| row.get(i))
790+
.map(|row| {
791+
let s = row.get(i);
792+
(!null_regex.is_null(s)).then_some(s)
793+
})
788794
.collect::<DictionaryArray<Int16Type>>(),
789795
) as ArrayRef),
790796
DataType::Int32 => Ok(Arc::new(
791797
rows.iter()
792-
.map(|row| row.get(i))
798+
.map(|row| {
799+
let s = row.get(i);
800+
(!null_regex.is_null(s)).then_some(s)
801+
})
793802
.collect::<DictionaryArray<Int32Type>>(),
794803
) as ArrayRef),
795804
DataType::Int64 => Ok(Arc::new(
796805
rows.iter()
797-
.map(|row| row.get(i))
806+
.map(|row| {
807+
let s = row.get(i);
808+
(!null_regex.is_null(s)).then_some(s)
809+
})
798810
.collect::<DictionaryArray<Int64Type>>(),
799811
) as ArrayRef),
800812
DataType::UInt8 => Ok(Arc::new(
801813
rows.iter()
802-
.map(|row| row.get(i))
814+
.map(|row| {
815+
let s = row.get(i);
816+
(!null_regex.is_null(s)).then_some(s)
817+
})
803818
.collect::<DictionaryArray<UInt8Type>>(),
804819
) as ArrayRef),
805820
DataType::UInt16 => Ok(Arc::new(
806821
rows.iter()
807-
.map(|row| row.get(i))
822+
.map(|row| {
823+
let s = row.get(i);
824+
(!null_regex.is_null(s)).then_some(s)
825+
})
808826
.collect::<DictionaryArray<UInt16Type>>(),
809827
) as ArrayRef),
810828
DataType::UInt32 => Ok(Arc::new(
811829
rows.iter()
812-
.map(|row| row.get(i))
830+
.map(|row| {
831+
let s = row.get(i);
832+
(!null_regex.is_null(s)).then_some(s)
833+
})
813834
.collect::<DictionaryArray<UInt32Type>>(),
814835
) as ArrayRef),
815836
DataType::UInt64 => Ok(Arc::new(
816837
rows.iter()
817-
.map(|row| row.get(i))
838+
.map(|row| {
839+
let s = row.get(i);
840+
(!null_regex.is_null(s)).then_some(s)
841+
})
818842
.collect::<DictionaryArray<UInt64Type>>(),
819843
) as ArrayRef),
820844
_ => Err(ArrowError::ParseError(format!(
@@ -1475,6 +1499,40 @@ mod tests {
14751499
assert_eq!(strings.value(29), "Uckfield, East Sussex, UK");
14761500
}
14771501

/// Checks that an empty `name` field is read as a null dictionary entry for
/// every integer dictionary key type listed below.
///
/// The fixture has three lines (`id,name`, `1,` and `2,bob`) and the test
/// expects three rows, so the `id,name` line is counted as data: row 1 is the
/// empty name and row 2 is `bob`.
#[test]
fn test_csv_with_nullable_dictionary() {
    // Fixed list that is only iterated once, so a stack array is enough.
    let key_types = [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
        DataType::UInt8,
        DataType::UInt16,
        DataType::UInt32,
        DataType::UInt64,
    ];
    for key_type in key_types {
        let file = File::open("test/data/dictionary_nullable_test.csv").unwrap();
        let dictionary_type =
            DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8));
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Utf8, false),
            Field::new("name", dictionary_type.clone(), true),
        ]));

        // The handle is not used again, so it is moved into the reader
        // instead of being duplicated with `try_clone`.
        let mut csv = ReaderBuilder::new(schema).build(file).unwrap();

        let batch = csv.next().unwrap().unwrap();
        assert_eq!(3, batch.num_rows());
        assert_eq!(2, batch.num_columns());

        // Assert the column type directly rather than casting the column to
        // the type it is already expected to have.
        let names = batch.column(1);
        assert_eq!(names.data_type(), &dictionary_type);
        assert!(names.is_null(1));
        assert!(!names.is_null(2));
    }
}
14781536
#[test]
14791537
fn test_nulls() {
14801538
let schema = Arc::new(Schema::new(vec![
arrow-csv/test/data/dictionary_nullable_test.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
id,name
2+
1,
3+
2,bob

0 commit comments

Comments
 (0)