@@ -225,29 +225,121 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
225
225
}
226
226
}
227
227
228
+ /// Converter for arrow schema to parquet schema
229
+ ///
230
+ /// Example:
231
+ /// ```
232
+ /// # use std::sync::Arc;
233
+ /// use arrow_schema::{Field, Schema, DataType};
234
+ /// use parquet::arrow::ArrowToParquetSchemaConverter;
235
+ /// use parquet::schema::types::{SchemaDescriptor, Type};
236
+ /// use parquet::basic;
237
+ /// let arrow_schema = Schema::new(vec![
238
+ /// Field::new("a", DataType::Int64, true),
239
+ /// Field::new("b", DataType::Date32, true),
240
+ /// ]);
241
+ ///
242
+ /// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
243
+ /// .build()
244
+ /// .unwrap();
245
+ /// // The converted schema should match this hand-built Parquet schema
246
+ /// let expected_parquet_schema = SchemaDescriptor::new(
247
+ /// Arc::new(
248
+ /// Type::group_type_builder("arrow_schema")
249
+ /// .with_fields(vec![
250
+ /// Arc::new(
251
+ /// Type::primitive_type_builder("a", basic::Type::INT64)
252
+ /// .build().unwrap()
253
+ /// ),
254
+ /// Arc::new(
255
+ /// Type::primitive_type_builder("b", basic::Type::INT32)
256
+ /// .with_converted_type(basic::ConvertedType::DATE)
257
+ /// .with_logical_type(Some(basic::LogicalType::Date))
258
+ /// .build().unwrap()
259
+ /// ),
260
+ /// ])
261
+ /// .build().unwrap()
262
+ /// )
263
+ /// );
264
+ ///
265
+ /// assert_eq!(parquet_schema, expected_parquet_schema);
266
+ /// ```
267
+ #[ derive( Debug ) ]
268
+ pub struct ArrowToParquetSchemaConverter < ' a > {
269
+ /// The schema to convert
270
+ schema : & ' a Schema ,
271
+ /// Name of the root schema in Parquet
272
+ schema_root : & ' a str ,
273
+ /// Should we Coerce arrow types to compatible Parquet types?
274
+ ///
275
+ /// See docs on [Self::with_coerce_types]`
276
+ coerce_types : bool
277
+ }
278
+
279
+ impl < ' a > ArrowToParquetSchemaConverter < ' a > {
280
+ /// Create a new converter
281
+ pub fn new ( schema : & ' a Schema ) -> Self {
282
+ Self {
283
+ schema,
284
+ schema_root : "arrow_schema" ,
285
+ coerce_types : false ,
286
+ }
287
+ }
288
+
289
+ /// Should arrow types be coerced into parquet native types (default false).
290
+ ///
291
+ /// Setting this option to `true` will result in parquet files that can be
292
+ /// read by more readers, but may lose precision for arrow types such as
293
+ /// [`DataType::Date64`] which have no direct corresponding Parquet type.
294
+ ///
295
+ /// # Discussion
296
+ ///
297
+ /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
298
+ /// corresponding Parquet logical type. Thus, they can not be losslessly
299
+ /// round-tripped when stored using the appropriate Parquet logical type.
300
+ ///
301
+ /// For example, some Date64 values may be truncated when stored with
302
+ /// parquet's native 32 bit date type.
303
+ ///
304
+ /// By default, the arrow writer does not coerce to native parquet types. It
305
+ /// writes data in such a way that it can be lossless round tripped.
306
+ /// However, this means downstream readers must be aware of and correctly
307
+ /// interpret the embedded Arrow schema.
308
+ pub fn with_coerce_types ( mut self , coerce_types : bool ) -> Self {
309
+ self . coerce_types = coerce_types;
310
+ self
311
+ }
312
+
313
+ /// Set the root schema element name (defaults to `"arrow_schema"`).
314
+ pub fn schema_root ( mut self , schema_root : & ' a str ) -> Self {
315
+ self . schema_root = schema_root;
316
+ self
317
+ }
318
+
319
+ /// Build the desired parquet [`SchemaDescriptor`]
320
+ pub fn build ( self ) -> Result < SchemaDescriptor > {
321
+ let Self { schema, schema_root : root_schema_name, coerce_types } = self ;
322
+ let fields = schema
323
+ . fields ( )
324
+ . iter ( )
325
+ . map ( |field| arrow_to_parquet_type ( field, coerce_types) . map ( Arc :: new) )
326
+ . collect :: < Result < _ > > ( ) ?;
327
+ let group = Type :: group_type_builder ( root_schema_name) . with_fields ( fields) . build ( ) ?;
328
+ Ok ( SchemaDescriptor :: new ( Arc :: new ( group) ) )
329
+ }
330
+ }
331
+
228
332
/// Convert arrow schema to parquet schema
229
333
///
230
334
/// The name of the root schema element defaults to `"arrow_schema"`, this can be
231
335
/// overridden with [`arrow_to_parquet_schema_with_root`]
232
- pub fn arrow_to_parquet_schema ( schema : & Schema , coerce_types : bool ) -> Result < SchemaDescriptor > {
233
- arrow_to_parquet_schema_with_root ( schema, "arrow_schema" , coerce_types)
234
- }
336
+ #[ deprecated( since = "54.0.0" , note = "Use `ArrowToParquetSchemaConverter` instead" ) ]
337
+ pub fn arrow_to_parquet_schema ( schema : & Schema ) -> Result < SchemaDescriptor > {
235
338
236
- /// Convert arrow schema to parquet schema specifying the name of the root schema element
237
- pub fn arrow_to_parquet_schema_with_root (
238
- schema : & Schema ,
239
- root : & str ,
240
- coerce_types : bool ,
241
- ) -> Result < SchemaDescriptor > {
242
- let fields = schema
243
- . fields ( )
244
- . iter ( )
245
- . map ( |field| arrow_to_parquet_type ( field, coerce_types) . map ( Arc :: new) )
246
- . collect :: < Result < _ > > ( ) ?;
247
- let group = Type :: group_type_builder ( root) . with_fields ( fields) . build ( ) ?;
248
- Ok ( SchemaDescriptor :: new ( Arc :: new ( group) ) )
339
+ ArrowToParquetSchemaConverter :: new ( schema) . build ( )
249
340
}
250
341
342
+
251
343
fn parse_key_value_metadata (
252
344
key_value_metadata : Option < & Vec < KeyValue > > ,
253
345
) -> Option < HashMap < String , String > > {
@@ -1569,7 +1661,7 @@ mod tests {
1569
1661
Field :: new( "decimal256" , DataType :: Decimal256 ( 39 , 2 ) , false ) ,
1570
1662
] ;
1571
1663
let arrow_schema = Schema :: new ( arrow_fields) ;
1572
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, false ) . unwrap ( ) ;
1664
+ let converted_arrow_schema = ArrowToParquetSchemaConverter :: new ( & arrow_schema) . build ( ) . unwrap ( ) ;
1573
1665
1574
1666
assert_eq ! (
1575
1667
parquet_schema. columns( ) . len( ) ,
@@ -1606,9 +1698,10 @@ mod tests {
1606
1698
false ,
1607
1699
) ] ;
1608
1700
let arrow_schema = Schema :: new ( arrow_fields) ;
1609
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, true ) ;
1701
+ let converted_arrow_schema = ArrowToParquetSchemaConverter :: new ( & arrow_schema)
1702
+ . with_coerce_types ( true )
1703
+ . build ( ) ;
1610
1704
1611
- assert ! ( converted_arrow_schema. is_err( ) ) ;
1612
1705
converted_arrow_schema. unwrap ( ) ;
1613
1706
}
1614
1707
@@ -1878,7 +1971,9 @@ mod tests {
1878
1971
// don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
1879
1972
let arrow_schema = crate :: arrow:: parquet_to_arrow_schema ( & schema_descriptor, None ) ?;
1880
1973
1881
- let parq_schema_descr = crate :: arrow:: arrow_to_parquet_schema ( & arrow_schema, true ) ?;
1974
+ let parq_schema_descr = crate :: arrow:: ArrowToParquetSchemaConverter :: new ( & arrow_schema)
1975
+ . with_coerce_types ( true )
1976
+ . build ( ) ?;
1882
1977
let parq_fields = parq_schema_descr. root_schema ( ) . get_fields ( ) ;
1883
1978
assert_eq ! ( parq_fields. len( ) , 2 ) ;
1884
1979
assert_eq ! ( parq_fields[ 0 ] . get_basic_info( ) . id( ) , 1 ) ;
0 commit comments