Skip to content

Commit 50c7977

Browse files
authored
Reuse on expressions values in HashJoinExec (#14131)
* Reduce duplicated build side expressions evaluations in HashJoinExec * Reuse probe side on expressions values
1 parent 3906c04 commit 50c7977

File tree

1 file changed

+35
-30
lines changed

1 file changed

+35
-30
lines changed

datafusion/physical-plan/src/joins/hash_join.rs

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ struct JoinLeftData {
8787
hash_map: JoinHashMap,
8888
/// The input rows for the build side
8989
batch: RecordBatch,
90+
/// The build side on expressions values
91+
values: Vec<ArrayRef>,
9092
/// Shared bitmap builder for visited left indices
9193
visited_indices_bitmap: SharedBitmapBuilder,
9294
/// Counter of running probe-threads, potentially
@@ -104,13 +106,15 @@ impl JoinLeftData {
104106
fn new(
105107
hash_map: JoinHashMap,
106108
batch: RecordBatch,
109+
values: Vec<ArrayRef>,
107110
visited_indices_bitmap: SharedBitmapBuilder,
108111
probe_threads_counter: AtomicUsize,
109112
reservation: MemoryReservation,
110113
) -> Self {
111114
Self {
112115
hash_map,
113116
batch,
117+
values,
114118
visited_indices_bitmap,
115119
probe_threads_counter,
116120
_reservation: reservation,
@@ -127,6 +131,11 @@ impl JoinLeftData {
127131
&self.batch
128132
}
129133

134+
/// returns a reference to the build side expressions values
135+
fn values(&self) -> &[ArrayRef] {
136+
&self.values
137+
}
138+
130139
/// returns a reference to the visited indices bitmap
131140
fn visited_indices_bitmap(&self) -> &SharedBitmapBuilder {
132141
&self.visited_indices_bitmap
@@ -853,7 +862,6 @@ impl ExecutionPlan for HashJoinExec {
853862

854863
Ok(Box::pin(HashJoinStream {
855864
schema: self.schema(),
856-
on_left,
857865
on_right,
858866
filter: self.filter.clone(),
859867
join_type: self.join_type,
@@ -984,9 +992,18 @@ async fn collect_left_input(
984992
BooleanBufferBuilder::new(0)
985993
};
986994

995+
let left_values = on_left
996+
.iter()
997+
.map(|c| {
998+
c.evaluate(&single_batch)?
999+
.into_array(single_batch.num_rows())
1000+
})
1001+
.collect::<Result<Vec<_>>>()?;
1002+
9871003
let data = JoinLeftData::new(
9881004
hashmap,
9891005
single_batch,
1006+
left_values,
9901007
Mutex::new(visited_indices_bitmap),
9911008
AtomicUsize::new(probe_threads_count),
9921009
reservation,
@@ -1136,6 +1153,8 @@ impl HashJoinStreamState {
11361153
struct ProcessProbeBatchState {
11371154
/// Current probe-side batch
11381155
batch: RecordBatch,
1156+
/// Probe-side on expressions values
1157+
values: Vec<ArrayRef>,
11391158
/// Starting offset for JoinHashMap lookups
11401159
offset: JoinHashMapOffset,
11411160
/// Max joined probe-side index from current batch
@@ -1162,8 +1181,6 @@ impl ProcessProbeBatchState {
11621181
struct HashJoinStream {
11631182
/// Input schema
11641183
schema: Arc<Schema>,
1165-
/// equijoin columns from the left (build side)
1166-
on_left: Vec<PhysicalExprRef>,
11671184
/// equijoin columns from the right (probe side)
11681185
on_right: Vec<PhysicalExprRef>,
11691186
/// optional join filter
@@ -1249,27 +1266,13 @@ impl RecordBatchStream for HashJoinStream {
12491266
#[allow(clippy::too_many_arguments)]
12501267
fn lookup_join_hashmap(
12511268
build_hashmap: &JoinHashMap,
1252-
build_input_buffer: &RecordBatch,
1253-
probe_batch: &RecordBatch,
1254-
build_on: &[PhysicalExprRef],
1255-
probe_on: &[PhysicalExprRef],
1269+
build_side_values: &[ArrayRef],
1270+
probe_side_values: &[ArrayRef],
12561271
null_equals_null: bool,
12571272
hashes_buffer: &[u64],
12581273
limit: usize,
12591274
offset: JoinHashMapOffset,
12601275
) -> Result<(UInt64Array, UInt32Array, Option<JoinHashMapOffset>)> {
1261-
let keys_values = probe_on
1262-
.iter()
1263-
.map(|c| c.evaluate(probe_batch)?.into_array(probe_batch.num_rows()))
1264-
.collect::<Result<Vec<_>>>()?;
1265-
let build_join_values = build_on
1266-
.iter()
1267-
.map(|c| {
1268-
c.evaluate(build_input_buffer)?
1269-
.into_array(build_input_buffer.num_rows())
1270-
})
1271-
.collect::<Result<Vec<_>>>()?;
1272-
12731276
let (probe_indices, build_indices, next_offset) = build_hashmap
12741277
.get_matched_indices_with_limit_offset(hashes_buffer, None, limit, offset);
12751278

@@ -1279,8 +1282,8 @@ fn lookup_join_hashmap(
12791282
let (build_indices, probe_indices) = equal_rows_arr(
12801283
&build_indices,
12811284
&probe_indices,
1282-
&build_join_values,
1283-
&keys_values,
1285+
build_side_values,
1286+
probe_side_values,
12841287
null_equals_null,
12851288
)?;
12861289

@@ -1430,6 +1433,7 @@ impl HashJoinStream {
14301433
self.state =
14311434
HashJoinStreamState::ProcessProbeBatch(ProcessProbeBatchState {
14321435
batch,
1436+
values: keys_values,
14331437
offset: (0, None),
14341438
joined_probe_idx: None,
14351439
});
@@ -1454,10 +1458,8 @@ impl HashJoinStream {
14541458
// get the matched by join keys indices
14551459
let (left_indices, right_indices, next_offset) = lookup_join_hashmap(
14561460
build_side.left_data.hash_map(),
1457-
build_side.left_data.batch(),
1458-
&state.batch,
1459-
&self.on_left,
1460-
&self.on_right,
1461+
build_side.left_data.values(),
1462+
&state.values,
14611463
self.null_equals_null,
14621464
&self.hashes_buffer,
14631465
self.batch_size,
@@ -3297,17 +3299,20 @@ mod tests {
32973299

32983300
let join_hash_map = JoinHashMap::new(hashmap_left, next);
32993301

3302+
let left_keys_values = key_column.evaluate(&left)?.into_array(left.num_rows())?;
33003303
let right_keys_values =
33013304
key_column.evaluate(&right)?.into_array(right.num_rows())?;
33023305
let mut hashes_buffer = vec![0; right.num_rows()];
3303-
create_hashes(&[right_keys_values], &random_state, &mut hashes_buffer)?;
3306+
create_hashes(
3307+
&[Arc::clone(&right_keys_values)],
3308+
&random_state,
3309+
&mut hashes_buffer,
3310+
)?;
33043311

33053312
let (l, r, _) = lookup_join_hashmap(
33063313
&join_hash_map,
3307-
&left,
3308-
&right,
3309-
&[Arc::clone(&key_column)],
3310-
&[key_column],
3314+
&[left_keys_values],
3315+
&[right_keys_values],
33113316
false,
33123317
&hashes_buffer,
33133318
8192,

0 commit comments

Comments
 (0)