@@ -45,6 +45,8 @@ mod primitive;
 use crate::arrow::ProjectionMask;
 pub(crate) use complex::{ParquetField, ParquetFieldType};
 
+use super::PARQUET_FIELD_ID_META_KEY;
+
 /// Convert Parquet schema to Arrow schema including optional metadata
 ///
 /// Attempts to decode any existing Arrow schema metadata, falling back
@@ -268,12 +270,20 @@ fn parse_key_value_metadata(
 /// Convert parquet column schema to arrow field.
 pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field> {
     let field = complex::convert_type(&parquet_column.self_type_ptr())?;
-
-    Ok(Field::new(
+    let mut ret = Field::new(
         parquet_column.name(),
         field.arrow_type,
         field.nullable,
-    ))
+    );
+
+    let basic_info = parquet_column.self_type().get_basic_info();
+    if basic_info.has_id() {
+        let mut meta = HashMap::with_capacity(1);
+        meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
+        ret.set_metadata(meta);
+    }
+
+    Ok(ret)
 }
 
 pub fn decimal_length_from_precision(precision: u8) -> usize {
@@ -578,6 +588,7 @@ mod tests {
 
     use crate::arrow::PARQUET_FIELD_ID_META_KEY;
     use crate::file::metadata::KeyValue;
+    use crate::file::reader::FileReader;
    use crate::{
         arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter},
         schema::{parser::parse_message_type, types::SchemaDescriptor},
@@ -1811,6 +1822,52 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_read_parquet_field_ids_raw() -> Result<()> {
+        let meta = |a: &[(&str, &str)]| -> HashMap<String, String> {
+            a.iter()
+                .map(|(a, b)| (a.to_string(), b.to_string()))
+                .collect()
+        };
+        let schema = Schema::new_with_metadata(
+            vec![
+                Field::new("c1", DataType::Utf8, true).with_metadata(meta(&[
+                    (PARQUET_FIELD_ID_META_KEY, "1"),
+                ])),
+                Field::new("c2", DataType::Utf8, true).with_metadata(meta(&[
+                    (PARQUET_FIELD_ID_META_KEY, "2"),
+                ])),
+            ],
+            HashMap::new(),
+        );
+
+        let writer = ArrowWriter::try_new(
+            vec![],
+            Arc::new(schema.clone()),
+            None,
+        )?;
+        let parquet_bytes = writer.into_inner()?;
+
+        let reader = crate::file::reader::SerializedFileReader::new(
+            bytes::Bytes::from(parquet_bytes),
+        )?;
+        let schema_descriptor = reader.metadata().file_metadata().schema_descr_ptr();
+
+        // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
+        let arrow_schema = crate::arrow::parquet_to_arrow_schema(
+            &schema_descriptor,
+            None,
+        )?;
+
+        let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?;
+        let parq_fields = parq_schema_descr.root_schema().get_fields();
+        assert_eq!(parq_fields.len(), 2);
+        assert_eq!(parq_fields[0].get_basic_info().id(), 1);
+        assert_eq!(parq_fields[1].get_basic_info().id(), 2);
+
+        Ok(())
+    }
+
     #[test]
     fn test_arrow_schema_roundtrip_lists() -> Result<()> {
         let metadata: HashMap<String, String> =
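
For downstream users, the effect of this change is that a column's Parquet field ID becomes visible on the converted Arrow field under the PARQUET_FIELD_ID_META_KEY metadata key. The standalone sketch below illustrates that from outside the crate; it is not part of the diff and assumes the public parquet::arrow::parquet_to_arrow_schema API, a recent arrow-rs where Field::metadata returns a map, and that parse_message_type accepts the `= <id>` field-id syntax.

use std::sync::Arc;

use parquet::arrow::{parquet_to_arrow_schema, PARQUET_FIELD_ID_META_KEY};
use parquet::schema::parser::parse_message_type;
use parquet::schema::types::SchemaDescriptor;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A raw Parquet schema whose columns carry field IDs (`= 1`, `= 2`),
    // with no serialized Arrow schema available in any key-value metadata.
    let message = "
        message schema {
            optional binary c1 (UTF8) = 1;
            optional binary c2 (UTF8) = 2;
        }
    ";
    let parquet_schema = SchemaDescriptor::new(Arc::new(parse_message_type(message)?));

    // Passing `None` for the key-value metadata forces the conversion to
    // derive everything, including field IDs, from the Parquet schema itself.
    let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;

    for field in arrow_schema.fields() {
        // With this change, each converted field should expose its Parquet
        // field ID in its metadata, e.g. c1: field id = Some("1").
        println!(
            "{}: field id = {:?}",
            field.name(),
            field.metadata().get(PARQUET_FIELD_ID_META_KEY)
        );
    }
    Ok(())
}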