Skip to content

Commit 820843f

Browse files
edmondop and alamb authored
Removes Bloom filter for Int8/Int16/Uint8/Uint16 (#9969)
* Removing broken tests
* Simplifying tests / removing support for failed tests
* Revert "Simplifying tests / removing support for failed tests"

  This reverts commit 6e50a80.
* Fixing tests for real
* Apply suggestions from code review

  Thanks @alamb!

  Co-authored-by: Andrew Lamb <[email protected]>

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent fc29c3e commit 820843f

File tree

2 files changed

+26
-32
lines changed

2 files changed

+26
-32
lines changed

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -232,12 +232,8 @@ impl PruningStatistics for BloomFilterStatistics {
232232
ScalarValue::Float32(Some(v)) => sbbf.check(v),
233233
ScalarValue::Int64(Some(v)) => sbbf.check(v),
234234
ScalarValue::Int32(Some(v)) => sbbf.check(v),
235-
ScalarValue::Int16(Some(v)) => sbbf.check(v),
236-
ScalarValue::Int8(Some(v)) => sbbf.check(v),
237235
ScalarValue::UInt64(Some(v)) => sbbf.check(v),
238236
ScalarValue::UInt32(Some(v)) => sbbf.check(v),
239-
ScalarValue::UInt16(Some(v)) => sbbf.check(v),
240-
ScalarValue::UInt8(Some(v)) => sbbf.check(v),
241237
ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
242238
Type::INT32 => {
243239
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ async fn prune_disabled() {
290290
// https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests pass
291291
// if and only if Bloom filters on Int8 and Int16 columns are still buggy.
292292
macro_rules! int_tests {
293-
($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => {
293+
($bits:expr) => {
294294
paste::item! {
295295
#[tokio::test]
296296
async fn [<prune_int $bits _lt >]() {
@@ -329,9 +329,9 @@ macro_rules! int_tests {
329329
.with_expected_errors(Some(0))
330330
.with_matched_by_stats(Some(1))
331331
.with_pruned_by_stats(Some(3))
332-
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
333-
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
334-
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
332+
.with_matched_by_bloom_filter(Some(1))
333+
.with_pruned_by_bloom_filter(Some(0))
334+
.with_expected_rows(1)
335335
.test_row_group_prune()
336336
.await;
337337
}
@@ -343,9 +343,9 @@ macro_rules! int_tests {
343343
.with_expected_errors(Some(0))
344344
.with_matched_by_stats(Some(1))
345345
.with_pruned_by_stats(Some(3))
346-
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
347-
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
348-
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
346+
.with_matched_by_bloom_filter(Some(1))
347+
.with_pruned_by_bloom_filter(Some(0))
348+
.with_expected_rows(1)
349349
.test_row_group_prune()
350350
.await;
351351
}
@@ -404,9 +404,9 @@ macro_rules! int_tests {
404404
.with_expected_errors(Some(0))
405405
.with_matched_by_stats(Some(1))
406406
.with_pruned_by_stats(Some(3))
407-
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
408-
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
409-
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
407+
.with_matched_by_bloom_filter(Some(1))
408+
.with_pruned_by_bloom_filter(Some(0))
409+
.with_expected_rows(1)
410410
.test_row_group_prune()
411411
.await;
412412
}
@@ -447,17 +447,16 @@ macro_rules! int_tests {
447447
};
448448
}
449449

450-
int_tests!(8, correct_bloom_filters: false);
451-
int_tests!(16, correct_bloom_filters: false);
452-
int_tests!(32, correct_bloom_filters: true);
453-
int_tests!(64, correct_bloom_filters: true);
450+
// int8/int16 are incorrect: https://github.com/apache/arrow-datafusion/issues/9779
451+
int_tests!(32);
452+
int_tests!(64);
454453

455454
// $bits: number of bits of the integer to test (32, 64)
456455
// UInt8 and UInt16 are not covered: Bloom filters on those columns are
457456
// incorrect (https://github.com/apache/arrow-datafusion/issues/9779), so the
458457
// macro is only instantiated for the widths whose Bloom filters work.
459458
macro_rules! uint_tests {
460-
($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => {
459+
($bits:expr) => {
461460
paste::item! {
462461
#[tokio::test]
463462
async fn [<prune_uint $bits _lt >]() {
@@ -482,9 +481,9 @@ macro_rules! uint_tests {
482481
.with_expected_errors(Some(0))
483482
.with_matched_by_stats(Some(1))
484483
.with_pruned_by_stats(Some(3))
485-
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
486-
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
487-
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
484+
.with_matched_by_bloom_filter(Some(1))
485+
.with_pruned_by_bloom_filter(Some(0))
486+
.with_expected_rows(1)
488487
.test_row_group_prune()
489488
.await;
490489
}
@@ -496,9 +495,9 @@ macro_rules! uint_tests {
496495
.with_expected_errors(Some(0))
497496
.with_matched_by_stats(Some(1))
498497
.with_pruned_by_stats(Some(3))
499-
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
500-
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
501-
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
498+
.with_matched_by_bloom_filter(Some(1))
499+
.with_pruned_by_bloom_filter(Some(0))
500+
.with_expected_rows(1)
502501
.test_row_group_prune()
503502
.await;
504503
}
@@ -542,9 +541,9 @@ macro_rules! uint_tests {
542541
.with_expected_errors(Some(0))
543542
.with_matched_by_stats(Some(1))
544543
.with_pruned_by_stats(Some(3))
545-
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
546-
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
547-
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
544+
.with_matched_by_bloom_filter(Some(1))
545+
.with_pruned_by_bloom_filter(Some(0))
546+
.with_expected_rows(1)
548547
.test_row_group_prune()
549548
.await;
550549
}
@@ -585,10 +584,9 @@ macro_rules! uint_tests {
585584
};
586585
}
587586

588-
uint_tests!(8, correct_bloom_filters: false);
589-
uint_tests!(16, correct_bloom_filters: false);
590-
uint_tests!(32, correct_bloom_filters: true);
591-
uint_tests!(64, correct_bloom_filters: true);
587+
// uint8/uint16 are incorrect: https://github.com/apache/arrow-datafusion/issues/9779
588+
uint_tests!(32);
589+
uint_tests!(64);
592590

593591
#[tokio::test]
594592
async fn prune_int32_eq_large_in_list() {

0 commit comments

Comments
 (0)