@@ -25,13 +25,14 @@ use futures::channel::oneshot;
25
25
use futures:: future:: join_all;
26
26
use futures:: { StreamExt , TryStreamExt } ;
27
27
28
+ use crate :: arrow:: record_batch_transformer:: RecordBatchTransformer ;
28
29
use crate :: arrow:: ArrowReader ;
29
30
use crate :: delete_vector:: DeleteVector ;
30
31
use crate :: expr:: Predicate :: AlwaysTrue ;
31
32
use crate :: expr:: { Bind , BoundPredicate , Predicate } ;
32
33
use crate :: io:: FileIO ;
33
34
use crate :: scan:: { ArrowRecordBatchStream , FileScanTask , FileScanTaskDeleteFile } ;
34
- use crate :: spec:: DataContentType ;
35
+ use crate :: spec:: { DataContentType , Schema , SchemaRef } ;
35
36
use crate :: { Error , ErrorKind , Result } ;
36
37
37
38
#[ allow( unused) ]
@@ -164,7 +165,7 @@ impl CachingDeleteFileManager {
164
165
/// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot
165
166
/// channel to store them in the right place in the delete file managers state.
166
167
/// * The results of all of these futures are awaited on in parallel with the specified
167
- /// level of concurrency and collected into a vec. We then combine all of the delete
168
+ /// level of concurrency and collected into a vec. We then combine all the delete
168
169
/// vector maps that resulted from any positional delete or delete vector files into a
169
170
/// single map and persist it in the state.
170
171
///
@@ -206,19 +207,25 @@ impl CachingDeleteFileManager {
206
207
pub ( crate ) async fn load_deletes (
207
208
& self ,
208
209
delete_file_entries : & [ FileScanTaskDeleteFile ] ,
210
+ schema : SchemaRef ,
209
211
) -> Result < ( ) > {
210
212
let stream_items = delete_file_entries
211
213
. iter ( )
212
- . map ( |t| ( t. clone ( ) , self . file_io . clone ( ) , self . state . clone ( ) ) )
214
+ . map ( |t| (
215
+ t. clone ( ) ,
216
+ self . file_io . clone ( ) ,
217
+ self . state . clone ( ) ,
218
+ schema. clone ( ) ,
219
+ ) )
213
220
. collect :: < Vec < _ > > ( ) ;
214
221
// NOTE: removing the collect and just passing the iterator to futures::stream:iter
215
222
// results in an error 'implementation of `std::ops::FnOnce` is not general enough'
216
223
217
224
let task_stream = futures:: stream:: iter ( stream_items. into_iter ( ) ) ;
218
225
219
226
let results: Vec < ParsedDeleteFileContext > = task_stream
220
- . map ( move |( task, file_io, state_ref) | async {
221
- Self :: load_file_for_task ( task, file_io, state_ref) . await
227
+ . map ( move |( task, file_io, state_ref, schema ) | async {
228
+ Self :: load_file_for_task ( task, file_io, state_ref, schema ) . await
222
229
} )
223
230
. map ( move |ctx| Ok ( async { Self :: parse_file_content_for_task ( ctx. await ?) . await } ) )
224
231
. try_buffer_unordered ( self . concurrency_limit_data_files )
@@ -248,6 +255,7 @@ impl CachingDeleteFileManager {
248
255
task : FileScanTaskDeleteFile ,
249
256
file_io : FileIO ,
250
257
state : StateRef ,
258
+ schema : SchemaRef ,
251
259
) -> Result < DeleteFileContext > {
252
260
match task. file_type {
253
261
DataContentType :: PositionDeletes => Ok ( DeleteFileContext :: PosDels (
@@ -271,7 +279,11 @@ impl CachingDeleteFileManager {
271
279
} ;
272
280
273
281
Ok ( DeleteFileContext :: FreshEqDel {
274
- batch_stream : Self :: parquet_to_batch_stream ( & task. file_path , file_io) . await ?,
282
+ batch_stream : Self :: evolve_schema (
283
+ Self :: parquet_to_batch_stream ( & task. file_path , file_io) . await ?,
284
+ schema,
285
+ )
286
+ . await ?,
275
287
sender,
276
288
} )
277
289
}
@@ -351,6 +363,30 @@ impl CachingDeleteFileManager {
351
363
Ok ( Box :: pin ( record_batch_stream) as ArrowRecordBatchStream )
352
364
}
353
365
366
+ /// Evolves the schema of the RecordBatches from an equality delete file
367
+ async fn evolve_schema (
368
+ record_batch_stream : ArrowRecordBatchStream ,
369
+ target_schema : Arc < Schema > ,
370
+ ) -> Result < ArrowRecordBatchStream > {
371
+ let eq_ids = target_schema
372
+ . as_ref ( )
373
+ . field_id_to_name_map ( )
374
+ . keys ( )
375
+ . cloned ( )
376
+ . collect :: < Vec < _ > > ( ) ;
377
+
378
+ let mut record_batch_transformer =
379
+ RecordBatchTransformer :: build ( target_schema. clone ( ) , & eq_ids) ;
380
+
381
+ let record_batch_stream = record_batch_stream. map ( move |record_batch| {
382
+ record_batch. and_then ( |record_batch|
383
+ record_batch_transformer. process_record_batch ( record_batch)
384
+ )
385
+ } ) ;
386
+
387
+ Ok ( Box :: pin ( record_batch_stream) as ArrowRecordBatchStream )
388
+ }
389
+
354
390
/// Parses a record batch stream coming from positional delete files
355
391
///
356
392
/// Returns a map of data file path to a delete vector
@@ -483,7 +519,7 @@ mod tests {
483
519
let file_scan_tasks = setup ( table_location) ;
484
520
485
521
let result = delete_file_manager
486
- . load_deletes ( & file_scan_tasks[ 0 ] . deletes )
522
+ . load_deletes ( & file_scan_tasks[ 0 ] . deletes , file_scan_tasks [ 0 ] . schema_ref ( ) )
487
523
. await ;
488
524
489
525
assert ! ( result. is_err_and( |e| e. kind( ) == ErrorKind :: FeatureUnsupported ) ) ;
0 commit comments