@@ -28,7 +28,7 @@ use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::datasource::datasource::TableProvider;
 use datafusion::datasource::MemTable;
 use datafusion::execution::context::{SessionConfig, SessionContext};
-use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
+use datafusion::prelude::{AvroReadOptions, CsvReadOptions, ParquetReadOptions};
 
 use crate::catalog::{PyCatalog, PyTable};
 use crate::dataframe::PyDataFrame;
@@ -264,4 +264,109 @@ impl PySessionContext {
     fn session_id(&self) -> PyResult<String> {
         Ok(self.ctx.session_id())
     }
+
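+    /// Read a CSV file into a `PyDataFrame`, inferring the schema from
+    /// the first `schema_infer_max_records` rows unless one is supplied.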
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        schema = "None",
+        has_header = "true",
+        delimiter = "\",\"",
+        schema_infer_max_records = "1000",
+        file_extension = "\".csv\"",
+        table_partition_cols = "vec![]"
+    )]
+    fn read_csv(
+        &self,
+        path: PathBuf,
+        schema: Option<Schema>,
+        has_header: bool,
+        delimiter: &str,
+        schema_infer_max_records: usize,
+        file_extension: &str,
+        table_partition_cols: Vec<String>,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let path = path
+            .to_str()
+            .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?;
+
+        let delimiter = delimiter.as_bytes();
+        if delimiter.len() != 1 {
+            return Err(PyValueError::new_err(
+                "Delimiter must be a single character",
+            ));
+        };
+
+        let mut options = CsvReadOptions::new()
+            .has_header(has_header)
+            .delimiter(delimiter[0])
+            .schema_infer_max_records(schema_infer_max_records)
+            .file_extension(file_extension)
+            .table_partition_cols(table_partition_cols);
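+        // Use the caller's schema, if one was given, instead of inferring it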
+        options.schema = schema.as_ref();
+
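+        // `SessionContext::read_csv` is async; block until the DataFrame is ready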
+        let result = self.ctx.read_csv(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+
+        Ok(df)
+    }
+
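+    /// Read a Parquet file into a `PyDataFrame`, with row-group pruning
+    /// enabled by default.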
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        parquet_pruning = "true",
+        file_extension = "\".parquet\"",
+        table_partition_cols = "vec![]",
+        skip_metadata = "true"
+    )]
+    fn read_parquet(
+        &self,
+        path: &str,
+        table_partition_cols: Vec<String>,
+        parquet_pruning: bool,
+        file_extension: &str,
+        skip_metadata: bool,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let mut options = ParquetReadOptions::default()
+            .table_partition_cols(table_partition_cols)
+            .parquet_pruning(parquet_pruning)
+            .skip_metadata(skip_metadata);
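+        // Only files with this extension are read when `path` is a directory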
+        options.file_extension = file_extension;
+
+        let result = self.ctx.read_parquet(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+        Ok(df)
+    }
+
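+    /// Read an Avro file into a `PyDataFrame`, using the schema embedded
+    /// in the file unless one is supplied.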
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        schema = "None",
+        file_extension = "\".avro\"",
+        table_partition_cols = "vec![]"
+    )]
+    fn read_avro(
+        &self,
+        path: &str,
+        schema: Option<Schema>,
+        table_partition_cols: Vec<String>,
+        file_extension: &str,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let mut options = AvroReadOptions::default().table_partition_cols(table_partition_cols);
+        options.file_extension = file_extension;
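+        // An explicit schema overrides the one stored in the Avro file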
+        options.schema = schema.map(Arc::new);
+
+        let result = self.ctx.read_avro(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+        Ok(df)
+    }
 }