Skip to content

Commit 2423211

Browse files
committed
Cleanup record skipping logic and tests (apache#2158)
1 parent e096ec7 commit 2423211

File tree

6 files changed

+121
-178
lines changed

6 files changed

+121
-178
lines changed

parquet/src/arrow/array_reader/byte_array.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::arrow::array_reader::{read_records, ArrayReader, set_column_reader};
18+
use crate::arrow::array_reader::{read_records, skip_records, ArrayReader};
1919
use crate::arrow::buffer::offset_buffer::OffsetBuffer;
2020
use crate::arrow::record_reader::buffer::ScalarValue;
2121
use crate::arrow::record_reader::GenericRecordReader;
@@ -120,8 +120,7 @@ impl<I: OffsetSizeTrait + ScalarValue> ArrayReader for ByteArrayReader<I> {
120120
}
121121

122122
fn skip_records(&mut self, num_records: usize) -> Result<usize> {
123-
set_column_reader(&mut self.record_reader, self.pages.as_mut())?;
124-
self.record_reader.skip_records(num_records)
123+
skip_records(&mut self.record_reader, self.pages.as_mut(), num_records)
125124
}
126125

127126
fn get_def_levels(&self) -> Option<&[i16]> {

parquet/src/arrow/array_reader/byte_array_dictionary.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow::buffer::Buffer;
2525
use arrow::datatypes::{ArrowNativeType, DataType as ArrowType};
2626

2727
use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain};
28-
use crate::arrow::array_reader::{read_records, ArrayReader, set_column_reader};
28+
use crate::arrow::array_reader::{read_records, ArrayReader, skip_records};
2929
use crate::arrow::buffer::{
3030
dictionary_buffer::DictionaryBuffer, offset_buffer::OffsetBuffer,
3131
};
@@ -181,8 +181,7 @@ where
181181
}
182182

183183
fn skip_records(&mut self, num_records: usize) -> Result<usize> {
184-
set_column_reader(&mut self.record_reader, self.pages.as_mut())?;
185-
self.record_reader.skip_records(num_records)
184+
skip_records(&mut self.record_reader, self.pages.as_mut(), num_records)
186185
}
187186

188187
fn get_def_levels(&self) -> Option<&[i16]> {

parquet/src/arrow/array_reader/mod.rs

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ impl RowGroupCollection for Arc<dyn FileReader> {
113113

114114
/// Uses `record_reader` to read up to `batch_size` records from `pages`
115115
///
116-
/// Returns the number of records read, which can be less than batch_size if
116+
/// Returns the number of records read, which can be less than `batch_size` if
117117
/// pages is exhausted.
118118
fn read_records<V, CV>(
119119
record_reader: &mut GenericRecordReader<V, CV>,
@@ -145,29 +145,36 @@ where
145145
Ok(records_read)
146146
}
147147

148-
/// Uses `pages` to set up to `record_reader` 's `column_reader`
148+
/// Uses `record_reader` to skip up to `batch_size` records from `pages`
149149
///
150-
/// If we skip records before all read operation,
151-
/// need set `column_reader` by `set_page_reader`
152-
/// for constructing `def_level_decoder` and `rep_level_decoder`.
153-
fn set_column_reader<V, CV>(
150+
/// Returns the number of records skipped, which can be less than `batch_size` if
151+
/// pages is exhausted.
152+
fn skip_records<V, CV>(
154153
record_reader: &mut GenericRecordReader<V, CV>,
155154
pages: &mut dyn PageIterator,
156-
) -> Result<bool>
157-
where
158-
V: ValuesBuffer + Default,
159-
CV: ColumnValueDecoder<Slice = V::Slice>,
155+
batch_size: usize,
156+
) -> Result<usize>
157+
where
158+
V: ValuesBuffer + Default,
159+
CV: ColumnValueDecoder<Slice = V::Slice>,
160160
{
161-
return if record_reader.column_reader().is_none() {
162-
// If we skip records before all read operation
163-
// we need set `column_reader` by `set_page_reader`
164-
if let Some(page_reader) = pages.next() {
165-
record_reader.set_page_reader(page_reader?)?;
166-
Ok(true)
167-
} else {
168-
Ok(false)
161+
let mut records_skipped = 0usize;
162+
while records_skipped < batch_size {
163+
let records_to_read = batch_size - records_skipped;
164+
165+
let records_skipped_once = record_reader.skip_records(records_to_read)?;
166+
records_skipped += records_skipped_once;
167+
168+
// Record reader exhausted
169+
if records_skipped_once < records_to_read {
170+
if let Some(page_reader) = pages.next() {
171+
// Read from new page reader (i.e. column chunk)
172+
record_reader.set_page_reader(page_reader?)?;
173+
} else {
174+
// Page reader also exhausted
175+
break;
176+
}
169177
}
170-
} else {
171-
Ok(true)
172-
};
178+
}
179+
Ok(records_skipped)
173180
}

parquet/src/arrow/array_reader/null_array.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::arrow::array_reader::{read_records, ArrayReader, set_column_reader};
18+
use crate::arrow::array_reader::{read_records, ArrayReader, skip_records};
1919
use crate::arrow::record_reader::buffer::ScalarValue;
2020
use crate::arrow::record_reader::RecordReader;
2121
use crate::column::page::PageIterator;
@@ -97,8 +97,7 @@ where
9797
}
9898

9999
fn skip_records(&mut self, num_records: usize) -> Result<usize> {
100-
set_column_reader(&mut self.record_reader, self.pages.as_mut())?;
101-
self.record_reader.skip_records(num_records)
100+
skip_records(&mut self.record_reader, self.pages.as_mut(), num_records)
102101
}
103102

104103
fn get_def_levels(&self) -> Option<&[i16]> {

parquet/src/arrow/array_reader/primitive_array.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::arrow::array_reader::{read_records, set_column_reader, ArrayReader};
18+
use crate::arrow::array_reader::{read_records, skip_records, ArrayReader};
1919
use crate::arrow::record_reader::buffer::ScalarValue;
2020
use crate::arrow::record_reader::RecordReader;
2121
use crate::arrow::schema::parquet_to_arrow_field;
@@ -222,8 +222,7 @@ where
222222
}
223223

224224
fn skip_records(&mut self, num_records: usize) -> Result<usize> {
225-
set_column_reader(&mut self.record_reader, self.pages.as_mut())?;
226-
self.record_reader.skip_records(num_records)
225+
skip_records(&mut self.record_reader, self.pages.as_mut(), num_records)
227226
}
228227

229228
fn get_def_levels(&self) -> Option<&[i16]> {

0 commit comments

Comments
 (0)