@@ -23,7 +23,7 @@ use std::sync::Arc;
23
23
use arrow:: datatypes:: Schema ;
24
24
use arrow:: { self , datatypes:: SchemaRef } ;
25
25
use async_trait:: async_trait;
26
- use futures :: StreamExt ;
26
+ use datafusion_data_access :: FileMeta ;
27
27
28
28
use super :: FileFormat ;
29
29
use crate :: avro_to_arrow:: read_avro_schema_from_reader;
@@ -32,7 +32,7 @@ use crate::logical_plan::Expr;
32
32
use crate :: physical_plan:: file_format:: { AvroExec , FileScanConfig } ;
33
33
use crate :: physical_plan:: ExecutionPlan ;
34
34
use crate :: physical_plan:: Statistics ;
35
- use datafusion_data_access:: object_store:: { ObjectReader , ObjectReaderStream } ;
35
+ use datafusion_data_access:: object_store:: ObjectStore ;
36
36
37
37
/// The default file extension of avro files
38
38
pub const DEFAULT_AVRO_EXTENSION : & str = ".avro" ;
@@ -46,10 +46,14 @@ impl FileFormat for AvroFormat {
46
46
self
47
47
}
48
48
49
- async fn infer_schema ( & self , mut readers : ObjectReaderStream ) -> Result < SchemaRef > {
49
+ async fn infer_schema (
50
+ & self ,
51
+ store : & Arc < dyn ObjectStore > ,
52
+ files : & [ FileMeta ] ,
53
+ ) -> Result < SchemaRef > {
50
54
let mut schemas = vec ! [ ] ;
51
- while let Some ( obj_reader ) = readers . next ( ) . await {
52
- let mut reader = obj_reader ?. sync_reader ( ) ?;
55
+ for file in files {
56
+ let mut reader = store . file_reader ( file . sized_file . clone ( ) ) ?. sync_reader ( ) ?;
53
57
let schema = read_avro_schema_from_reader ( & mut reader) ?;
54
58
schemas. push ( schema) ;
55
59
}
@@ -59,8 +63,9 @@ impl FileFormat for AvroFormat {
59
63
60
64
async fn infer_stats (
61
65
& self ,
62
- _reader : Arc < dyn ObjectReader > ,
66
+ _store : & Arc < dyn ObjectStore > ,
63
67
_table_schema : SchemaRef ,
68
+ _file : & FileMeta ,
64
69
) -> Result < Statistics > {
65
70
Ok ( Statistics :: default ( ) )
66
71
}
@@ -78,15 +83,9 @@ impl FileFormat for AvroFormat {
78
83
#[ cfg( test) ]
79
84
#[ cfg( feature = "avro" ) ]
80
85
mod tests {
81
- use crate :: {
82
- datafusion_data_access:: object_store:: local:: {
83
- local_object_reader, local_object_reader_stream, LocalFileSystem ,
84
- } ,
85
- physical_plan:: collect,
86
- } ;
87
-
88
86
use super :: * ;
89
- use crate :: datasource:: listing:: local_unpartitioned_file;
87
+ use crate :: datasource:: file_format:: test_util:: scan_format;
88
+ use crate :: physical_plan:: collect;
90
89
use crate :: prelude:: { SessionConfig , SessionContext } ;
91
90
use arrow:: array:: {
92
91
BinaryArray , BooleanArray , Float32Array , Float64Array , Int32Array ,
@@ -100,7 +99,7 @@ mod tests {
100
99
let ctx = SessionContext :: with_config ( config) ;
101
100
let task_ctx = ctx. task_ctx ( ) ;
102
101
let projection = None ;
103
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
102
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
104
103
let stream = exec. execute ( 0 , task_ctx) ?;
105
104
106
105
let tt_batches = stream
@@ -122,7 +121,7 @@ mod tests {
122
121
let session_ctx = SessionContext :: new ( ) ;
123
122
let task_ctx = session_ctx. task_ctx ( ) ;
124
123
let projection = None ;
125
- let exec = get_exec ( "alltypes_plain.avro" , & projection, Some ( 1 ) ) . await ?;
124
+ let exec = get_exec ( "alltypes_plain.avro" , projection, Some ( 1 ) ) . await ?;
126
125
let batches = collect ( exec, task_ctx) . await ?;
127
126
assert_eq ! ( 1 , batches. len( ) ) ;
128
127
assert_eq ! ( 11 , batches[ 0 ] . num_columns( ) ) ;
@@ -136,7 +135,7 @@ mod tests {
136
135
let session_ctx = SessionContext :: new ( ) ;
137
136
let task_ctx = session_ctx. task_ctx ( ) ;
138
137
let projection = None ;
139
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
138
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
140
139
141
140
let x: Vec < String > = exec
142
141
. schema ( )
@@ -188,7 +187,7 @@ mod tests {
188
187
let session_ctx = SessionContext :: new ( ) ;
189
188
let task_ctx = session_ctx. task_ctx ( ) ;
190
189
let projection = Some ( vec ! [ 1 ] ) ;
191
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
190
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
192
191
193
192
let batches = collect ( exec, task_ctx) . await ?;
194
193
assert_eq ! ( batches. len( ) , 1 ) ;
@@ -218,7 +217,7 @@ mod tests {
218
217
let session_ctx = SessionContext :: new ( ) ;
219
218
let task_ctx = session_ctx. task_ctx ( ) ;
220
219
let projection = Some ( vec ! [ 0 ] ) ;
221
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
220
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
222
221
223
222
let batches = collect ( exec, task_ctx) . await ?;
224
223
assert_eq ! ( batches. len( ) , 1 ) ;
@@ -245,7 +244,7 @@ mod tests {
245
244
let session_ctx = SessionContext :: new ( ) ;
246
245
let task_ctx = session_ctx. task_ctx ( ) ;
247
246
let projection = Some ( vec ! [ 10 ] ) ;
248
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
247
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
249
248
250
249
let batches = collect ( exec, task_ctx) . await ?;
251
250
assert_eq ! ( batches. len( ) , 1 ) ;
@@ -272,7 +271,7 @@ mod tests {
272
271
let session_ctx = SessionContext :: new ( ) ;
273
272
let task_ctx = session_ctx. task_ctx ( ) ;
274
273
let projection = Some ( vec ! [ 6 ] ) ;
275
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
274
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
276
275
277
276
let batches = collect ( exec, task_ctx) . await ?;
278
277
assert_eq ! ( batches. len( ) , 1 ) ;
@@ -302,7 +301,7 @@ mod tests {
302
301
let session_ctx = SessionContext :: new ( ) ;
303
302
let task_ctx = session_ctx. task_ctx ( ) ;
304
303
let projection = Some ( vec ! [ 7 ] ) ;
305
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
304
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
306
305
307
306
let batches = collect ( exec, task_ctx) . await ?;
308
307
assert_eq ! ( batches. len( ) , 1 ) ;
@@ -332,7 +331,7 @@ mod tests {
332
331
let session_ctx = SessionContext :: new ( ) ;
333
332
let task_ctx = session_ctx. task_ctx ( ) ;
334
333
let projection = Some ( vec ! [ 9 ] ) ;
335
- let exec = get_exec ( "alltypes_plain.avro" , & projection, None ) . await ?;
334
+ let exec = get_exec ( "alltypes_plain.avro" , projection, None ) . await ?;
336
335
337
336
let batches = collect ( exec, task_ctx) . await ?;
338
337
assert_eq ! ( batches. len( ) , 1 ) ;
@@ -359,36 +358,13 @@ mod tests {
359
358
360
359
async fn get_exec (
361
360
file_name : & str ,
362
- projection : & Option < Vec < usize > > ,
361
+ projection : Option < Vec < usize > > ,
363
362
limit : Option < usize > ,
364
363
) -> Result < Arc < dyn ExecutionPlan > > {
365
364
let testdata = crate :: test_util:: arrow_test_data ( ) ;
366
- let filename = format ! ( "{}/avro/{} " , testdata, file_name ) ;
365
+ let store_root = format ! ( "{}/avro" , testdata) ;
367
366
let format = AvroFormat { } ;
368
- let file_schema = format
369
- . infer_schema ( local_object_reader_stream ( vec ! [ filename. clone( ) ] ) )
370
- . await
371
- . expect ( "Schema inference" ) ;
372
- let statistics = format
373
- . infer_stats ( local_object_reader ( filename. clone ( ) ) , file_schema. clone ( ) )
374
- . await
375
- . expect ( "Stats inference" ) ;
376
- let file_groups = vec ! [ vec![ local_unpartitioned_file( filename. to_owned( ) ) ] ] ;
377
- let exec = format
378
- . create_physical_plan (
379
- FileScanConfig {
380
- object_store : Arc :: new ( LocalFileSystem { } ) ,
381
- file_schema,
382
- file_groups,
383
- statistics,
384
- projection : projection. clone ( ) ,
385
- limit,
386
- table_partition_cols : vec ! [ ] ,
387
- } ,
388
- & [ ] ,
389
- )
390
- . await ?;
391
- Ok ( exec)
367
+ scan_format ( & format, & store_root, file_name, projection, limit) . await
392
368
}
393
369
}
394
370
@@ -397,18 +373,17 @@ mod tests {
397
373
mod tests {
398
374
use super :: * ;
399
375
400
- use crate :: datafusion_data_access :: object_store :: local :: local_object_reader_stream ;
376
+ use super :: super :: test_util :: scan_format ;
401
377
use crate :: error:: DataFusionError ;
402
378
403
379
#[ tokio:: test]
404
380
async fn test ( ) -> Result < ( ) > {
381
+ let format = AvroFormat { } ;
405
382
let testdata = crate :: test_util:: arrow_test_data ( ) ;
406
- let filename = format ! ( "{}/avro/alltypes_plain.avro" , testdata) ;
407
- let schema_result = AvroFormat { }
408
- . infer_schema ( local_object_reader_stream ( vec ! [ filename] ) )
409
- . await ;
383
+ let filename = "avro/alltypes_plain.avro" ;
384
+ let result = scan_format ( & format, & testdata, filename, None , None ) . await ;
410
385
assert ! ( matches!(
411
- schema_result ,
386
+ result ,
412
387
Err ( DataFusionError :: NotImplemented ( msg) )
413
388
if msg == * "cannot read avro schema without the 'avro' feature enabled"
414
389
) ) ;
0 commit comments