Skip to content

Commit 8c495b6

Browse files
parquet: Read field IDs from Parquet Schema (#4878)
Currently, field ids are only read from the serialized arrow schema and not the actual parquet file. This PR adds reading the field ids from a Parquet file that doesn't contain the serialized arrow schema. Signed-off-by: 🐼 Samrose Ahmed 🐼 <[email protected]>
1 parent 3ac0053 commit 8c495b6

File tree

2 files changed

+71
-5
lines changed

2 files changed

+71
-5
lines changed

parquet/src/arrow/schema/complex.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::collections::HashMap;
1919
use std::sync::Arc;
2020

2121
use crate::arrow::schema::primitive::convert_primitive;
22-
use crate::arrow::ProjectionMask;
22+
use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
2323
use crate::basic::{ConvertedType, Repetition};
2424
use crate::errors::ParquetError;
2525
use crate::errors::Result;
@@ -550,7 +550,16 @@ fn convert_field(
550550

551551
field.with_metadata(hint.metadata().clone())
552552
}
553-
None => Field::new(name, data_type, nullable),
553+
None => {
554+
let mut ret = Field::new(name, data_type, nullable);
555+
let basic_info = parquet_type.get_basic_info();
556+
if basic_info.has_id() {
557+
let mut meta = HashMap::with_capacity(1);
558+
meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
559+
ret.set_metadata(meta);
560+
}
561+
ret
562+
},
554563
}
555564
}
556565

parquet/src/arrow/schema/mod.rs

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ mod primitive;
4545
use crate::arrow::ProjectionMask;
4646
pub(crate) use complex::{ParquetField, ParquetFieldType};
4747

48+
use super::PARQUET_FIELD_ID_META_KEY;
49+
4850
/// Convert Parquet schema to Arrow schema including optional metadata
4951
///
5052
/// Attempts to decode any existing Arrow schema metadata, falling back
@@ -268,12 +270,20 @@ fn parse_key_value_metadata(
268270
/// Convert parquet column schema to arrow field.
269271
pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field> {
270272
let field = complex::convert_type(&parquet_column.self_type_ptr())?;
271-
272-
Ok(Field::new(
273+
let mut ret = Field::new(
273274
parquet_column.name(),
274275
field.arrow_type,
275276
field.nullable,
276-
))
277+
);
278+
279+
let basic_info = parquet_column.self_type().get_basic_info();
280+
if basic_info.has_id() {
281+
let mut meta = HashMap::with_capacity(1);
282+
meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
283+
ret.set_metadata(meta);
284+
}
285+
286+
Ok(ret)
277287
}
278288

279289
pub fn decimal_length_from_precision(precision: u8) -> usize {
@@ -578,6 +588,7 @@ mod tests {
578588

579589
use crate::arrow::PARQUET_FIELD_ID_META_KEY;
580590
use crate::file::metadata::KeyValue;
591+
use crate::file::reader::FileReader;
581592
use crate::{
582593
arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter},
583594
schema::{parser::parse_message_type, types::SchemaDescriptor},
@@ -1811,6 +1822,52 @@ mod tests {
18111822
Ok(())
18121823
}
18131824

1825+
#[test]
1826+
fn test_read_parquet_field_ids_raw() -> Result<()> {
1827+
let meta = |a: &[(&str, &str)]| -> HashMap<String, String> {
1828+
a.iter()
1829+
.map(|(a, b)| (a.to_string(), b.to_string()))
1830+
.collect()
1831+
};
1832+
let schema = Schema::new_with_metadata(
1833+
vec![
1834+
Field::new("c1", DataType::Utf8, true).with_metadata(meta(&[
1835+
(PARQUET_FIELD_ID_META_KEY, "1"),
1836+
])),
1837+
Field::new("c2", DataType::Utf8, true).with_metadata(meta(&[
1838+
(PARQUET_FIELD_ID_META_KEY, "2"),
1839+
])),
1840+
],
1841+
HashMap::new(),
1842+
);
1843+
1844+
let writer = ArrowWriter::try_new(
1845+
vec![],
1846+
Arc::new(schema.clone()),
1847+
None,
1848+
)?;
1849+
let parquet_bytes = writer.into_inner()?;
1850+
1851+
let reader = crate::file::reader::SerializedFileReader::new(
1852+
bytes::Bytes::from(parquet_bytes),
1853+
)?;
1854+
let schema_descriptor = reader.metadata().file_metadata().schema_descr_ptr();
1855+
1856+
// don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
1857+
let arrow_schema = crate::arrow::parquet_to_arrow_schema(
1858+
&schema_descriptor,
1859+
None,
1860+
)?;
1861+
1862+
let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?;
1863+
let parq_fields = parq_schema_descr.root_schema().get_fields();
1864+
assert_eq!(parq_fields.len(), 2);
1865+
assert_eq!(parq_fields[0].get_basic_info().id(), 1);
1866+
assert_eq!(parq_fields[1].get_basic_info().id(), 2);
1867+
1868+
Ok(())
1869+
}
1870+
18141871
#[test]
18151872
fn test_arrow_schema_roundtrip_lists() -> Result<()> {
18161873
let metadata: HashMap<String, String> =

0 commit comments

Comments
 (0)