diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index 302359984..212b06fc1 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -31,7 +31,7 @@ pub(crate) struct InclusiveMetricsEvaluator<'a> { } impl<'a> InclusiveMetricsEvaluator<'a> { - fn new(data_file: &'a DataFile) -> Self { + pub fn new(data_file: &'a DataFile) -> Self { InclusiveMetricsEvaluator { data_file } } @@ -52,6 +52,18 @@ impl<'a> InclusiveMetricsEvaluator<'a> { visit(&mut evaluator, filter) } + pub(crate) fn evaluate( + &mut self, + filter: &'a BoundPredicate, + include_empty_files: bool, + ) -> crate::Result { + if !include_empty_files && self.data_file.record_count == 0 { + return ROWS_CANNOT_MATCH; + } + + visit(self, filter) + } + fn nan_count(&self, field_id: i32) -> Option<&u64> { self.data_file.nan_value_counts.get(&field_id) } diff --git a/crates/iceberg/src/expr/visitors/mod.rs b/crates/iceberg/src/expr/visitors/mod.rs index 0066bbc6d..bfc16c31d 100644 --- a/crates/iceberg/src/expr/visitors/mod.rs +++ b/crates/iceberg/src/expr/visitors/mod.rs @@ -21,6 +21,7 @@ pub(crate) mod inclusive_metrics_evaluator; pub(crate) mod inclusive_projection; pub(crate) mod manifest_evaluator; pub(crate) mod page_index_evaluator; +pub(crate) mod partition_metrics_evaluator; pub(crate) mod row_group_metrics_evaluator; pub(crate) mod strict_metrics_evaluator; pub(crate) mod strict_projection; diff --git a/crates/iceberg/src/expr/visitors/partition_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/partition_metrics_evaluator.rs new file mode 100644 index 000000000..b96f61fa3 --- /dev/null +++ b/crates/iceberg/src/expr/visitors/partition_metrics_evaluator.rs @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; + +use super::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; +use super::strict_metrics_evaluator::StrictMetricsEvaluator; +use crate::expr::BoundPredicate; +use crate::spec::{DataFile, PartitionSpec, Schema, Struct}; +use crate::Error; + +/// An evaluator that checks whether rows in a file may/must match a given expression. +/// The intent is to first partially evaluate the expression using the partition value and +/// then check the residual with metrics evaluators; only the metrics checks exist so far. +#[allow(dead_code)] +struct PartitionAndMetricsEvaluator<'a> { + schema: Schema, + metrics_evaluator: HashMap, StrictMetricsEvaluator<'a>)>, +} + +// TODO: Implement residual visitor +#[allow(dead_code)] +impl<'a> PartitionAndMetricsEvaluator<'a> { + fn new(schema: Schema, _partition_spec: PartitionSpec, _predicate: BoundPredicate) -> Self { + PartitionAndMetricsEvaluator { + schema, + metrics_evaluator: HashMap::new(), + } + } + + // Return the evaluator pair cached for the file's partition, creating it on first use. + // NOTE(review): a cache hit reuses the first `DataFile` seen for that partition — confirm. 
+ fn get_metrics_evaluator<'b>( + &'b mut self, + data_file: &'b DataFile, + ) -> &'b mut (InclusiveMetricsEvaluator<'a>, StrictMetricsEvaluator<'a>) + where + 'b: 'a, + { + let partition = data_file.partition(); + + if !self.metrics_evaluator.contains_key(partition) { + let inclusive = InclusiveMetricsEvaluator::new(data_file); + let strict = StrictMetricsEvaluator::new(data_file); + self.metrics_evaluator + .insert(partition.clone(), (inclusive, strict)); + } + + self.metrics_evaluator.get_mut(partition).unwrap() + } + + pub fn rows_might_match<'b>( + &'b mut self, + filter: &'b BoundPredicate, + data_file: &'a DataFile, + ) -> Result + where + 'b: 'a, + { + let (inclusive, _) = self.get_metrics_evaluator(data_file); + inclusive.evaluate(filter, false) + } + + pub fn rows_must_match<'b>( + &'b mut self, + filter: &'b BoundPredicate, + data_file: &'a DataFile, + ) -> Result + where + 'b: 'a, + { + let (_, strict) = self.get_metrics_evaluator(data_file); + strict.evaluate(filter) + } +} diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs index 841b743e5..65a8774d7 100644 --- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs @@ -40,9 +40,9 @@ pub(crate) struct StrictMetricsEvaluator<'a> { data_file: &'a DataFile, } +#[allow(dead_code)] impl<'a> StrictMetricsEvaluator<'a> { - #[allow(dead_code)] - fn new(data_file: &'a DataFile) -> Self { + pub fn new(data_file: &'a DataFile) -> Self { StrictMetricsEvaluator { data_file } } @@ -50,7 +50,6 @@ impl<'a> StrictMetricsEvaluator<'a> { /// provided [`DataFile`]'s metrics. Used by [`TableScan`] to /// see if this `DataFile` contains data that could match /// the scan's filter. 
- #[allow(dead_code)] pub(crate) fn eval(filter: &'a BoundPredicate, data_file: &'a DataFile) -> crate::Result { if data_file.record_count == 0 { return ROWS_MUST_MATCH; @@ -60,6 +59,15 @@ impl<'a> StrictMetricsEvaluator<'a> { visit(&mut evaluator, filter) } + /// Evaluates `filter` on this evaluator's `DataFile`; zero records yields `ROWS_MUST_MATCH`. + pub(crate) fn evaluate(&mut self, filter: &'a BoundPredicate) -> crate::Result { + if self.data_file.record_count == 0 { + return ROWS_MUST_MATCH; + } + + visit(self, filter) + } + fn nan_count(&self, field_id: i32) -> Option<&u64> { self.data_file.nan_value_counts.get(&field_id) }