@@ -42,11 +42,11 @@ use arrow_array::{Array, ArrayRef};
42
42
use arrow_schema:: { DataType , Schema , SchemaRef , TimeUnit } ;
43
43
use datafusion_common:: hash_utils:: create_hashes;
44
44
use datafusion_common:: { not_impl_err, DataFusionError , Result } ;
45
- use datafusion_execution:: memory_pool:: proxy:: { RawTableAllocExt , VecAllocExt } ;
45
+ use datafusion_execution:: memory_pool:: proxy:: { HashTableAllocExt , VecAllocExt } ;
46
46
use datafusion_expr:: EmitTo ;
47
47
use datafusion_physical_expr:: binary_map:: OutputType ;
48
48
49
- use hashbrown:: raw :: RawTable ;
49
+ use hashbrown:: hash_table :: HashTable ;
50
50
51
51
const NON_INLINED_FLAG : u64 = 0x8000000000000000 ;
52
52
const VALUE_MASK : u64 = 0x7FFFFFFFFFFFFFFF ;
@@ -180,7 +180,7 @@ pub struct GroupValuesColumn<const STREAMING: bool> {
180
180
/// And we use [`GroupIndexView`] to represent such `group indices` in table.
181
181
///
182
182
///
183
- map : RawTable < ( u64 , GroupIndexView ) > ,
183
+ map : HashTable < ( u64 , GroupIndexView ) > ,
184
184
185
185
/// The size of `map` in bytes
186
186
map_size : usize ,
@@ -261,7 +261,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
261
261
262
262
/// Create a new instance of GroupValuesColumn if supported for the specified schema
263
263
pub fn try_new ( schema : SchemaRef ) -> Result < Self > {
264
- let map = RawTable :: with_capacity ( 0 ) ;
264
+ let map = HashTable :: with_capacity ( 0 ) ;
265
265
Ok ( Self {
266
266
schema,
267
267
map,
@@ -338,7 +338,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
338
338
for ( row, & target_hash) in batch_hashes. iter ( ) . enumerate ( ) {
339
339
let entry = self
340
340
. map
341
- . get_mut ( target_hash, |( exist_hash, group_idx_view) | {
341
+ . find_mut ( target_hash, |( exist_hash, group_idx_view) | {
342
342
// It is ensured to be inlined in `scalarized_intern`
343
343
debug_assert ! ( !group_idx_view. is_non_inlined( ) ) ;
344
344
@@ -506,7 +506,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
506
506
for ( row, & target_hash) in batch_hashes. iter ( ) . enumerate ( ) {
507
507
let entry = self
508
508
. map
509
- . get ( target_hash, |( exist_hash, _) | target_hash == * exist_hash) ;
509
+ . find ( target_hash, |( exist_hash, _) | target_hash == * exist_hash) ;
510
510
511
511
let Some ( ( _, group_index_view) ) = entry else {
512
512
// 1. Bucket not found case
@@ -733,7 +733,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
733
733
734
734
for & row in & self . vectorized_operation_buffers . remaining_row_indices {
735
735
let target_hash = batch_hashes[ row] ;
736
- let entry = map. get_mut ( target_hash, |( exist_hash, _) | {
736
+ let entry = map. find_mut ( target_hash, |( exist_hash, _) | {
737
737
// Somewhat surprisingly, this closure can be called even if the
738
738
// hash doesn't match, so check the hash first with an integer
739
739
// comparison first avoid the more expensive comparison with
@@ -852,7 +852,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
852
852
/// Return group indices of the hash, also if its `group_index_view` is non-inlined
853
853
#[ cfg( test) ]
854
854
fn get_indices_by_hash ( & self , hash : u64 ) -> Option < ( Vec < usize > , GroupIndexView ) > {
855
- let entry = self . map . get ( hash, |( exist_hash, _) | hash == * exist_hash) ;
855
+ let entry = self . map . find ( hash, |( exist_hash, _) | hash == * exist_hash) ;
856
856
857
857
match entry {
858
858
Some ( ( _, group_index_view) ) => {
@@ -1091,67 +1091,63 @@ impl<const STREAMING: bool> GroupValues for GroupValuesColumn<STREAMING> {
1091
1091
. collect :: < Vec < _ > > ( ) ;
1092
1092
let mut next_new_list_offset = 0 ;
1093
1093
1094
- // SAFETY: self.map outlives iterator and is not modified concurrently
1095
- unsafe {
1096
- for bucket in self . map . iter ( ) {
1097
- // In non-streaming case, we need to check if the `group index view`
1098
- // is `inlined` or `non-inlined`
1099
- if !STREAMING && bucket. as_ref ( ) . 1 . is_non_inlined ( ) {
1100
- // Non-inlined case
1101
- // We take `group_index_list` from `old_group_index_lists`
1102
-
1103
- // list_offset is incrementally
1104
- self . emit_group_index_list_buffer . clear ( ) ;
1105
- let list_offset = bucket. as_ref ( ) . 1 . value ( ) as usize ;
1106
- for group_index in self . group_index_lists [ list_offset] . iter ( )
1107
- {
1108
- if let Some ( remaining) = group_index. checked_sub ( n) {
1109
- self . emit_group_index_list_buffer . push ( remaining) ;
1110
- }
1094
+ self . map . retain ( |( _exist_hash, group_idx_view) | {
1095
+ // In non-streaming case, we need to check if the `group index view`
1096
+ // is `inlined` or `non-inlined`
1097
+ if !STREAMING && group_idx_view. is_non_inlined ( ) {
1098
+ // Non-inlined case
1099
+ // We take `group_index_list` from `old_group_index_lists`
1100
+
1101
+ // list_offset is incrementally
1102
+ self . emit_group_index_list_buffer . clear ( ) ;
1103
+ let list_offset = group_idx_view. value ( ) as usize ;
1104
+ for group_index in self . group_index_lists [ list_offset] . iter ( ) {
1105
+ if let Some ( remaining) = group_index. checked_sub ( n) {
1106
+ self . emit_group_index_list_buffer . push ( remaining) ;
1111
1107
}
1112
-
1113
- // The possible results:
1114
- // - `new_group_index_list` is empty, we should erase this bucket
1115
- // - only one value in `new_group_index_list`, switch the `view` to `inlined`
1116
- // - still multiple values in `new_group_index_list`, build and set the new `unlined view`
1117
- if self . emit_group_index_list_buffer . is_empty ( ) {
1118
- self . map . erase ( bucket) ;
1119
- } else if self . emit_group_index_list_buffer . len ( ) == 1 {
1120
- let group_index =
1121
- self . emit_group_index_list_buffer . first ( ) . unwrap ( ) ;
1122
- bucket. as_mut ( ) . 1 =
1123
- GroupIndexView :: new_inlined ( * group_index as u64 ) ;
1124
- } else {
1125
- let group_index_list =
1126
- & mut self . group_index_lists [ next_new_list_offset] ;
1127
- group_index_list. clear ( ) ;
1128
- group_index_list
1129
- . extend ( self . emit_group_index_list_buffer . iter ( ) ) ;
1130
- bucket. as_mut ( ) . 1 = GroupIndexView :: new_non_inlined (
1131
- next_new_list_offset as u64 ,
1132
- ) ;
1133
- next_new_list_offset += 1 ;
1134
- }
1135
-
1136
- continue ;
1137
1108
}
1138
1109
1110
+ // The possible results:
1111
+ // - `new_group_index_list` is empty, we should erase this bucket
1112
+ // - only one value in `new_group_index_list`, switch the `view` to `inlined`
1113
+ // - still multiple values in `new_group_index_list`, build and set the new `unlined view`
1114
+ if self . emit_group_index_list_buffer . is_empty ( ) {
1115
+ false
1116
+ } else if self . emit_group_index_list_buffer . len ( ) == 1 {
1117
+ let group_index =
1118
+ self . emit_group_index_list_buffer . first ( ) . unwrap ( ) ;
1119
+ * group_idx_view =
1120
+ GroupIndexView :: new_inlined ( * group_index as u64 ) ;
1121
+ true
1122
+ } else {
1123
+ let group_index_list =
1124
+ & mut self . group_index_lists [ next_new_list_offset] ;
1125
+ group_index_list. clear ( ) ;
1126
+ group_index_list
1127
+ . extend ( self . emit_group_index_list_buffer . iter ( ) ) ;
1128
+ * group_idx_view = GroupIndexView :: new_non_inlined (
1129
+ next_new_list_offset as u64 ,
1130
+ ) ;
1131
+ next_new_list_offset += 1 ;
1132
+ true
1133
+ }
1134
+ } else {
1139
1135
// In `streaming case`, the `group index view` is ensured to be `inlined`
1140
- debug_assert ! ( !bucket . as_ref ( ) . 1 . is_non_inlined( ) ) ;
1136
+ debug_assert ! ( !group_idx_view . is_non_inlined( ) ) ;
1141
1137
1142
1138
// Inlined case, we just decrement group index by n)
1143
- let group_index = bucket . as_ref ( ) . 1 . value ( ) as usize ;
1139
+ let group_index = group_idx_view . value ( ) as usize ;
1144
1140
match group_index. checked_sub ( n) {
1145
1141
// Group index was >= n, shift value down
1146
1142
Some ( sub) => {
1147
- bucket . as_mut ( ) . 1 =
1148
- GroupIndexView :: new_inlined ( sub as u64 )
1143
+ * group_idx_view = GroupIndexView :: new_inlined ( sub as u64 ) ;
1144
+ true
1149
1145
}
1150
1146
// Group index was < n, so remove from table
1151
- None => self . map . erase ( bucket ) ,
1147
+ None => false ,
1152
1148
}
1153
1149
}
1154
- }
1150
+ } ) ;
1155
1151
1156
1152
if !STREAMING {
1157
1153
self . group_index_lists . truncate ( next_new_list_offset) ;
@@ -1243,7 +1239,7 @@ mod tests {
1243
1239
use arrow:: { compute:: concat_batches, util:: pretty:: pretty_format_batches} ;
1244
1240
use arrow_array:: { ArrayRef , Int64Array , RecordBatch , StringArray , StringViewArray } ;
1245
1241
use arrow_schema:: { DataType , Field , Schema , SchemaRef } ;
1246
- use datafusion_common:: utils:: proxy:: RawTableAllocExt ;
1242
+ use datafusion_common:: utils:: proxy:: HashTableAllocExt ;
1247
1243
use datafusion_expr:: EmitTo ;
1248
1244
1249
1245
use crate :: aggregates:: group_values:: {
0 commit comments