Skip to content

Commit a5af643

Browse files
authored
Improve concat performance, and add append_array for some array builder implementations (#7309)
* feat: add `append_buffer` for `NullBufferBuilder`
* feat: add `append_array` for `PrimitiveBuilder`
* feat: add `append_array` for `BooleanBuilder`
* test: add test that the underlying null values are added as is
* wip
* format and lint
* add special implementation for concat primitives and booleans improving perf by 50%
* add more tests for generic bytes builder
* add special implementation for bytes in concat
* manually concat primitives
* add large array impl
* wip
* remove unsafe API and use primitive builder in concat
* lint and format
* fix concat primitives to use the input array data type
* format
* add back the capacity for binary because dictionary call concat_fallback
* add tests and update comment
* extract benchmark changes to different PR #7376
1 parent 5e6c19a commit a5af643

File tree

4 files changed

+411
-4
lines changed

4 files changed

+411
-4
lines changed

arrow-array/src/builder/boolean_builder.rs

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use crate::builder::{ArrayBuilder, BooleanBufferBuilder};
19-
use crate::{ArrayRef, BooleanArray};
19+
use crate::{Array, ArrayRef, BooleanArray};
2020
use arrow_buffer::Buffer;
2121
use arrow_buffer::NullBufferBuilder;
2222
use arrow_data::ArrayData;
@@ -146,6 +146,18 @@ impl BooleanBuilder {
146146
}
147147
}
148148

149+
/// Appends array values and null to this builder as is
150+
/// (this means that underlying null values are copied as is).
151+
#[inline]
152+
pub fn append_array(&mut self, array: &BooleanArray) {
153+
self.values_builder.append_buffer(array.values());
154+
if let Some(null_buffer) = array.nulls() {
155+
self.null_buffer_builder.append_buffer(null_buffer);
156+
} else {
157+
self.null_buffer_builder.append_n_non_nulls(array.len());
158+
}
159+
}
160+
149161
/// Builds the [BooleanArray] and reset this builder.
150162
pub fn finish(&mut self) -> BooleanArray {
151163
let len = self.len();
@@ -232,6 +244,7 @@ impl Extend<Option<bool>> for BooleanBuilder {
232244
mod tests {
233245
use super::*;
234246
use crate::Array;
247+
use arrow_buffer::{BooleanBuffer, NullBuffer};
235248

236249
#[test]
237250
fn test_boolean_array_builder() {
@@ -346,4 +359,50 @@ mod tests {
346359
let values = array.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
347360
assert_eq!(&values, &[true, true, true, false, false])
348361
}
362+
363+
/// Appending three consecutive pieces of the same input must give the same
/// array as building it from the whole input at once.
#[test]
fn test_append_array() {
    let input = vec![
        Some(true),
        None,
        Some(true),
        None,
        Some(false),
        None,
        None,
        None,
        Some(false),
        Some(false),
        Some(false),
        Some(true),
        Some(false),
    ];

    let mut builder = BooleanBuilder::new();
    // The middle piece (5..8) consists of nulls only.
    for piece in [&input[..5], &input[5..8], &input[8..]] {
        let part = BooleanArray::from(piece.to_vec());
        builder.append_array(&part);
    }
    let actual = builder.finish();

    let expected = BooleanArray::from(input);
    assert_eq!(actual, expected);
}
393+
394+
/// The value bits hidden behind null slots must survive `append_array`
/// untouched, not just the logical contents of the array.
#[test]
fn test_append_array_add_underlying_null_values() {
    let values = BooleanBuffer::from(vec![true, false, true, false]);
    let validity = NullBuffer::from(&[true, true, false, false]);
    let array = BooleanArray::new(values, Some(validity));

    let mut builder = BooleanBuilder::new();
    builder.append_array(&array);
    let actual = builder.finish();

    // Logical equality first, then the raw value buffer.
    assert_eq!(actual, array);
    assert_eq!(actual.values(), array.values());
}
349408
}

arrow-array/src/builder/generic_bytes_builder.rs

Lines changed: 218 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
1919
use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
20-
use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
20+
use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
2121
use arrow_buffer::NullBufferBuilder;
2222
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
2323
use arrow_data::ArrayDataBuilder;
@@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
129129
self.offsets_builder.append(self.next_offset());
130130
}
131131

132+
/// Appends array values and null to this builder as is
133+
/// (this means that underlying null values are copied as is).
134+
#[inline]
135+
pub fn append_array(&mut self, array: &GenericByteArray<T>) {
136+
if array.len() == 0 {
137+
return;
138+
}
139+
140+
let offsets = array.offsets();
141+
142+
// If the offsets are contiguous, we can append them directly avoiding the need to align
143+
// for example, when the first appended array is not sliced (starts at offset 0)
144+
if self.next_offset() == offsets[0] {
145+
self.offsets_builder.append_slice(&offsets[1..]);
146+
} else {
147+
// Shifting all the offsets
148+
let shift: T::Offset = self.next_offset() - offsets[0];
149+
150+
// Creating intermediate offsets instead of pushing each offset is faster
151+
// (even if we make MutableBuffer to avoid updating length on each push
152+
// and reserve the necessary capacity, it's still slower)
153+
let mut intermediate = Vec::with_capacity(offsets.len() - 1);
154+
155+
for &offset in &offsets[1..] {
156+
intermediate.push(offset + shift)
157+
}
158+
159+
self.offsets_builder.append_slice(&intermediate);
160+
}
161+
162+
// Append underlying values, starting from the first offset and ending at the last offset
163+
self.value_builder.append_slice(
164+
&array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()],
165+
);
166+
167+
if let Some(null_buffer) = array.nulls() {
168+
self.null_buffer_builder.append_buffer(null_buffer);
169+
} else {
170+
self.null_buffer_builder.append_n_non_nulls(array.len());
171+
}
172+
}
173+
132174
/// Builds the [`GenericByteArray`] and reset this builder.
133175
pub fn finish(&mut self) -> GenericByteArray<T> {
134176
let array_type = T::DATA_TYPE;
@@ -358,6 +400,7 @@ mod tests {
358400
use super::*;
359401
use crate::array::Array;
360402
use crate::GenericStringArray;
403+
use arrow_buffer::NullBuffer;
361404
use std::fmt::Write as _;
362405
use std::io::Write as _;
363406

@@ -593,4 +636,178 @@ mod tests {
593636
&["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
594637
)
595638
}
639+
640+
/// Appending consecutive null-free pieces must reproduce the array built
/// from the whole input.
#[test]
fn test_append_array_without_nulls() {
    let input = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];

    let mut builder = GenericStringBuilder::<i32>::new();
    for piece in [&input[..3], &input[3..7], &input[7..]] {
        let part = GenericStringArray::<i32>::from(piece.to_vec());
        builder.append_array(&part);
    }
    let actual = builder.finish();

    let expected = GenericStringArray::<i32>::from(input);
    assert_eq!(actual, expected);
}
660+
661+
/// Same as the null-free case, but the pieces contain nulls, and the middle
/// piece consists of nulls only.
#[test]
fn test_append_array_with_nulls() {
    let input = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];

    let mut builder = GenericStringBuilder::<i32>::new();
    for piece in [&input[..3], &input[3..7], &input[7..]] {
        let part = GenericStringArray::<i32>::from(piece.to_vec());
        builder.append_array(&part);
    }
    let actual = builder.finish();

    let expected = GenericStringArray::<i32>::from(input);
    assert_eq!(actual, expected);
}
690+
691+
/// Appending an array with no elements must leave the builder empty.
#[test]
fn test_append_empty_array() {
    let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());

    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_array(&empty);

    let built = builder.finish();
    assert_eq!(built.len(), 0);
}
699+
700+
/// A slice whose first offset is not 0 and whose last offset is not the end
/// of the parent's values must still be appended correctly.
#[test]
fn test_append_array_with_offset_not_starting_at_0() {
    let full_array = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ]);
    let sliced = full_array.slice(1, 4);

    // Guard the premise of the test: the slice really is offset on both ends.
    assert_ne!(sliced.offsets()[0].as_usize(), 0);
    assert_ne!(sliced.offsets().last(), full_array.offsets().last());

    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_array(&sliced);
    let actual = builder.finish();

    let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);
    assert_eq!(actual, expected);
}
729+
730+
/// The bytes stored underneath null slots must be copied into the builder
/// as is, so the resulting value buffer is the plain concatenation of every
/// input string, null or not.
#[test]
fn test_append_underlying_null_values_added_as_is() {
    // Builds a string array from non-null strings and then lays a null mask
    // over it, so the null slots still have bytes behind them.
    let with_validity = |strings: Vec<&str>, validity: NullBuffer| {
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(strings).into_parts();
        GenericStringArray::<i32>::new(offsets, buffer, Some(validity))
    };

    let input_1_array_with_nulls = with_validity(
        vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
        ],
        NullBuffer::from(&[
            true, false, true, false, false, true, true, true, false,
        ]),
    );
    let input_2_array_with_nulls = with_validity(
        vec!["doing", "well", "thank", "you", "for", "asking"],
        NullBuffer::from(&[false, false, true, false, true, true]),
    );

    let mut builder = GenericStringBuilder::<i32>::new();
    for part in [&input_1_array_with_nulls, &input_2_array_with_nulls] {
        builder.append_array(part);
    }
    let actual = builder.finish();

    let expected = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None, // world
        Some("how"),
        None, // are
        None, // you
        Some("doing"),
        Some("today"),
        Some("I"),
        None, // am
        None, // doing
        None, // well
        Some("thank"),
        None, // "you",
        Some("for"),
        Some("asking"),
    ]);
    assert_eq!(actual, expected);

    // The masked-out strings must still be present in the raw bytes.
    let all_bytes = [
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ]
    .concat();
    let expected_underlying_buffer = Buffer::from(all_bytes.as_bytes());
    assert_eq!(actual.values(), &expected_underlying_buffer);
}
792+
793+
/// Appending back-to-back slices of one array (each slice starts at the
/// offset where the previous one ended) must rebuild the original array.
#[test]
fn append_array_with_continues_indices() {
    let full_array = GenericStringArray::<i32>::from(vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ]);

    // (start, length) of each consecutive slice; together they cover the array.
    let ranges = [(0, 3), (3, 4), (7, full_array.len() - 7)];

    let mut builder = GenericStringBuilder::<i32>::new();
    for (start, length) in ranges {
        let piece = full_array.slice(start, length);
        builder.append_array(&piece);
    }
    let actual = builder.finish();

    assert_eq!(actual, full_array);
}
596813
}

0 commit comments

Comments
 (0)