17
17
18
18
use std:: sync:: Arc ;
19
19
20
- use arrow:: array:: { ArrayRef , Int32Array } ;
20
+ use arrow:: array:: { ArrayRef , Int32Array , StringArray } ;
21
21
use arrow:: compute:: { concat_batches, SortOptions } ;
22
22
use arrow:: datatypes:: SchemaRef ;
23
23
use arrow:: record_batch:: RecordBatch ;
@@ -45,6 +45,7 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
45
45
use test_utils:: add_empty_batches;
46
46
47
47
use hashbrown:: HashMap ;
48
+ use rand:: distributions:: Alphanumeric ;
48
49
use rand:: rngs:: StdRng ;
49
50
use rand:: { Rng , SeedableRng } ;
50
51
@@ -607,25 +608,6 @@ fn convert_bound_to_current_row_if_applicable(
607
608
}
608
609
}
609
610
610
- /// This utility determines whether a given window frame can be executed with
611
- /// multiple ORDER BY expressions. As an example, range frames with offset (such
612
- /// as `RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING`) cannot have ORDER BY clauses
613
- /// of the form `\[ORDER BY a ASC, b ASC, ...]`
614
- fn can_accept_multi_orderby ( window_frame : & WindowFrame ) -> bool {
615
- match window_frame. units {
616
- WindowFrameUnits :: Rows => true ,
617
- WindowFrameUnits :: Range => {
618
- // Range can only accept multi ORDER BY clauses when bounds are
619
- // CURRENT ROW or UNBOUNDED PRECEDING/FOLLOWING:
620
- ( window_frame. start_bound . is_unbounded ( )
621
- || window_frame. start_bound == WindowFrameBound :: CurrentRow )
622
- && ( window_frame. end_bound . is_unbounded ( )
623
- || window_frame. end_bound == WindowFrameBound :: CurrentRow )
624
- }
625
- WindowFrameUnits :: Groups => true ,
626
- }
627
- }
628
-
629
611
/// Perform batch and running window on the same input
630
612
/// and verify outputs of `WindowAggExec` and `BoundedWindowAggExec` are equal
631
613
async fn run_window_test (
@@ -649,7 +631,7 @@ async fn run_window_test(
649
631
options : SortOptions :: default ( ) ,
650
632
} )
651
633
}
652
- if orderby_exprs. len ( ) > 1 && !can_accept_multi_orderby ( & window_frame ) {
634
+ if orderby_exprs. len ( ) > 1 && !window_frame . can_accept_multi_orderby ( ) {
653
635
orderby_exprs = orderby_exprs[ 0 ..1 ] . to_vec ( ) ;
654
636
}
655
637
let mut partitionby_exprs = vec ! [ ] ;
@@ -733,11 +715,30 @@ async fn run_window_test(
733
715
) ?) as _ ;
734
716
let task_ctx = ctx. task_ctx ( ) ;
735
717
let collected_usual = collect ( usual_window_exec, task_ctx. clone ( ) ) . await ?;
736
- let collected_running = collect ( running_window_exec, task_ctx) . await ?;
718
+ let collected_running = collect ( running_window_exec, task_ctx)
719
+ . await ?
720
+ . into_iter ( )
721
+ . filter ( |b| b. num_rows ( ) > 0 )
722
+ . collect :: < Vec < _ > > ( ) ;
737
723
738
724
// BoundedWindowAggExec should produce more chunks than the usual WindowAggExec.
739
725
// Otherwise it means that we cannot generate results in running mode.
740
- assert ! ( collected_running. len( ) > collected_usual. len( ) ) ;
726
+ let err_msg = format ! ( "Inconsistent result for window_frame: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, random_seed: {random_seed:?}, search_mode: {search_mode:?}, partition_by_columns:{partition_by_columns:?}, orderby_columns: {orderby_columns:?}" ) ;
727
+ // The check below makes sure that streaming execution generates more chunks than bulk execution,
728
+ // since algorithms and operators work on sliding windows in streaming execution.
729
+ // However, in the current test setup, for some randomly generated window frame clauses, it is not guaranteed
730
+ // for streaming execution to generate more chunks than its non-streaming counterpart in the Linear mode.
731
+ // As an example window frame `OVER(PARTITION BY d ORDER BY a RANGE BETWEEN CURRENT ROW AND 9 FOLLOWING)`
732
+ // needs to receive a=10 to generate the result for the rows where a=0. If the generated input data is within the range [0, 9],
733
+ // even in streaming mode, the generated result will be a single bulk, as in the non-streaming version.
734
+ if search_mode != Linear {
735
+ assert ! (
736
+ collected_running. len( ) > collected_usual. len( ) ,
737
+ "{}" ,
738
+ err_msg
739
+ ) ;
740
+ }
741
+
741
742
// compare
742
743
let usual_formatted = pretty_format_batches ( & collected_usual) ?. to_string ( ) ;
743
744
let running_formatted = pretty_format_batches ( & collected_running) ?. to_string ( ) ;
@@ -767,10 +768,17 @@ async fn run_window_test(
767
768
Ok ( ( ) )
768
769
}
769
770
771
+ fn generate_random_string ( rng : & mut StdRng , length : usize ) -> String {
772
+ rng. sample_iter ( & Alphanumeric )
773
+ . take ( length)
774
+ . map ( char:: from)
775
+ . collect ( )
776
+ }
777
+
770
778
/// Return randomly sized record batches with:
771
779
/// three sorted int32 columns 'a', 'b', 'c' ranged from 0..DISTINCT as columns
772
780
/// one random int32 column x
773
- fn make_staggered_batches < const STREAM : bool > (
781
+ pub ( crate ) fn make_staggered_batches < const STREAM : bool > (
774
782
len : usize ,
775
783
n_distinct : usize ,
776
784
random_seed : u64 ,
@@ -779,6 +787,7 @@ fn make_staggered_batches<const STREAM: bool>(
779
787
let mut rng = StdRng :: seed_from_u64 ( random_seed) ;
780
788
let mut input123: Vec < ( i32 , i32 , i32 ) > = vec ! [ ( 0 , 0 , 0 ) ; len] ;
781
789
let mut input4: Vec < i32 > = vec ! [ 0 ; len] ;
790
+ let mut input5: Vec < String > = vec ! [ "" . to_string( ) ; len] ;
782
791
input123. iter_mut ( ) . for_each ( |v| {
783
792
* v = (
784
793
rng. gen_range ( 0 ..n_distinct) as i32 ,
@@ -788,17 +797,23 @@ fn make_staggered_batches<const STREAM: bool>(
788
797
} ) ;
789
798
input123. sort ( ) ;
790
799
rng. fill ( & mut input4[ ..] ) ;
800
+ input5. iter_mut ( ) . for_each ( |v| {
801
+ * v = generate_random_string ( & mut rng, 1 ) ;
802
+ } ) ;
803
+ input5. sort ( ) ;
791
804
let input1 = Int32Array :: from_iter_values ( input123. iter ( ) . map ( |k| k. 0 ) ) ;
792
805
let input2 = Int32Array :: from_iter_values ( input123. iter ( ) . map ( |k| k. 1 ) ) ;
793
806
let input3 = Int32Array :: from_iter_values ( input123. iter ( ) . map ( |k| k. 2 ) ) ;
794
807
let input4 = Int32Array :: from_iter_values ( input4) ;
808
+ let input5 = StringArray :: from_iter_values ( input5) ;
795
809
796
810
// split into several record batches
797
811
let mut remainder = RecordBatch :: try_from_iter ( vec ! [
798
812
( "a" , Arc :: new( input1) as ArrayRef ) ,
799
813
( "b" , Arc :: new( input2) as ArrayRef ) ,
800
814
( "c" , Arc :: new( input3) as ArrayRef ) ,
801
815
( "x" , Arc :: new( input4) as ArrayRef ) ,
816
+ ( "string_field" , Arc :: new( input5) as ArrayRef ) ,
802
817
] )
803
818
. unwrap ( ) ;
804
819
@@ -807,6 +822,7 @@ fn make_staggered_batches<const STREAM: bool>(
807
822
while remainder. num_rows ( ) > 0 {
808
823
let batch_size = rng. gen_range ( 0 ..50 ) ;
809
824
if remainder. num_rows ( ) < batch_size {
825
+ batches. push ( remainder) ;
810
826
break ;
811
827
}
812
828
batches. push ( remainder. slice ( 0 , batch_size) ) ;
0 commit comments