|
17 | 17 |
|
18 | 18 | use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
|
19 | 19 | use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
|
20 |
| -use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; |
| 20 | +use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait}; |
21 | 21 | use arrow_buffer::NullBufferBuilder;
|
22 | 22 | use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
|
23 | 23 | use arrow_data::ArrayDataBuilder;
|
@@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
|
129 | 129 | self.offsets_builder.append(self.next_offset());
|
130 | 130 | }
|
131 | 131 |
|
| 132 | + /// Appends array values and null to this builder as is |
| 133 | + /// (this means that underlying null values are copied as is). |
| 134 | + #[inline] |
| 135 | + pub fn append_array(&mut self, array: &GenericByteArray<T>) { |
| 136 | + if array.len() == 0 { |
| 137 | + return; |
| 138 | + } |
| 139 | + |
| 140 | + let offsets = array.offsets(); |
| 141 | + |
| 142 | + // If the offsets are contiguous, we can append them directly avoiding the need to align |
| 143 | + // for example, when the first appended array is not sliced (starts at offset 0) |
| 144 | + if self.next_offset() == offsets[0] { |
| 145 | + self.offsets_builder.append_slice(&offsets[1..]); |
| 146 | + } else { |
| 147 | + // Shifting all the offsets |
| 148 | + let shift: T::Offset = self.next_offset() - offsets[0]; |
| 149 | + |
| 150 | + // Creating intermediate offsets instead of pushing each offset is faster |
| 151 | + // (even if we make MutableBuffer to avoid updating length on each push |
| 152 | + // and reserve the necessary capacity, it's still slower) |
| 153 | + let mut intermediate = Vec::with_capacity(offsets.len() - 1); |
| 154 | + |
| 155 | + for &offset in &offsets[1..] { |
| 156 | + intermediate.push(offset + shift) |
| 157 | + } |
| 158 | + |
| 159 | + self.offsets_builder.append_slice(&intermediate); |
| 160 | + } |
| 161 | + |
| 162 | + // Append underlying values, starting from the first offset and ending at the last offset |
| 163 | + self.value_builder.append_slice( |
| 164 | + &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()], |
| 165 | + ); |
| 166 | + |
| 167 | + if let Some(null_buffer) = array.nulls() { |
| 168 | + self.null_buffer_builder.append_buffer(null_buffer); |
| 169 | + } else { |
| 170 | + self.null_buffer_builder.append_n_non_nulls(array.len()); |
| 171 | + } |
| 172 | + } |
| 173 | + |
132 | 174 | /// Builds the [`GenericByteArray`] and reset this builder.
|
133 | 175 | pub fn finish(&mut self) -> GenericByteArray<T> {
|
134 | 176 | let array_type = T::DATA_TYPE;
|
@@ -358,6 +400,7 @@ mod tests {
|
358 | 400 | use super::*;
|
359 | 401 | use crate::array::Array;
|
360 | 402 | use crate::GenericStringArray;
|
| 403 | + use arrow_buffer::NullBuffer; |
361 | 404 | use std::fmt::Write as _;
|
362 | 405 | use std::io::Write as _;
|
363 | 406 |
|
@@ -593,4 +636,178 @@ mod tests {
|
593 | 636 | &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
|
594 | 637 | )
|
595 | 638 | }
|
| 639 | + |
/// Appending three separately built, null-free arrays must give the same result
/// as building one array from the concatenated input.
#[test]
fn test_append_array_without_nulls() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];

    // Split the input into pieces of 3, 4 and the remaining 8 words.
    let (head, rest) = words.split_at(3);
    let (middle, tail) = rest.split_at(4);

    let mut builder = GenericStringBuilder::<i32>::new();
    for chunk in [head, middle, tail] {
        builder.append_array(&GenericStringArray::<i32>::from(chunk.to_vec()));
    }

    assert_eq!(builder.finish(), GenericStringArray::<i32>::from(words));
}
| 660 | + |
/// Appending three separately built arrays that contain nulls must give the same
/// result as building one array from the concatenated input.
#[test]
fn test_append_array_with_nulls() {
    let entries = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];

    // Pieces of 3, 4 and the remaining 4 entries; the middle piece is all nulls.
    let (head, rest) = entries.split_at(3);
    let (middle, tail) = rest.split_at(4);

    let mut builder = GenericStringBuilder::<i32>::new();
    for chunk in [head, middle, tail] {
        builder.append_array(&GenericStringArray::<i32>::from(chunk.to_vec()));
    }

    assert_eq!(builder.finish(), GenericStringArray::<i32>::from(entries));
}
| 690 | + |
/// Appending an array with no entries must leave the builder empty.
#[test]
fn test_append_empty_array() {
    let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());

    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_array(&empty);

    assert_eq!(builder.finish().len(), 0);
}
| 699 | + |
/// Appending a slice whose first offset is non-zero must yield only the sliced
/// entries.
#[test]
fn test_append_array_with_offset_not_starting_at_0() {
    let source = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ]);

    // Four entries starting at index 1: null, "how", null, null.
    let window = source.slice(1, 4);

    // Guard the fixture itself: the slice must neither start at offset 0 nor end
    // at the last offset of the source array.
    assert_ne!(window.offsets()[0].as_usize(), 0);
    assert_ne!(window.offsets().last(), source.offsets().last());

    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_array(&window);

    assert_eq!(
        builder.finish(),
        GenericStringArray::<i32>::from(vec![None, Some("how"), None, None])
    );
}
| 729 | + |
/// Arrays whose null slots still have value bytes behind them must be appended
/// with those bytes intact: the logical result has the nulls, while the values
/// buffer contains every original string.
#[test]
fn test_append_underlying_null_values_added_as_is() {
    // First fixture: nine non-empty strings, then a validity mask is attached on top
    // of the unchanged offsets and values.
    let (first_offsets, first_values, _) = GenericStringArray::<i32>::from(vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
    ])
    .into_parts();
    let first = GenericStringArray::<i32>::new(
        first_offsets,
        first_values,
        Some(NullBuffer::from(&[
            true, false, true, false, false, true, true, true, false,
        ])),
    );

    // Second fixture: six non-empty strings with their own validity mask.
    let (second_offsets, second_values, _) =
        GenericStringArray::<i32>::from(vec!["doing", "well", "thank", "you", "for", "asking"])
            .into_parts();
    let second = GenericStringArray::<i32>::new(
        second_offsets,
        second_values,
        Some(NullBuffer::from(&[false, false, true, false, true, true])),
    );

    let mut builder = GenericStringBuilder::<i32>::new();
    for fixture in [&first, &second] {
        builder.append_array(fixture);
    }
    let combined = builder.finish();

    // Logical view: masked-out entries read as null.
    let expected_logical = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None, // world
        Some("how"),
        None, // are
        None, // you
        Some("doing"),
        Some("today"),
        Some("I"),
        None, // am
        None, // doing
        None, // well
        Some("thank"),
        None, // you
        Some("for"),
        Some("asking"),
    ]);
    assert_eq!(combined, expected_logical);

    // Physical view: the bytes of the masked-out strings are still present.
    let expected_bytes = [
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ]
    .concat();
    let expected_values = Buffer::from(expected_bytes.as_bytes());
    assert_eq!(combined.values(), &expected_values);
}
| 792 | + |
/// Appending consecutive slices of one array, in order, must rebuild that array.
///
/// NOTE(review): this presumably exercises the branch of `append_array` where the
/// incoming first offset equals the builder's next offset; confirm that `slice`
/// keeps the parent's offsets.
#[test]
fn append_array_with_continues_indices() {
    let whole = GenericStringArray::<i32>::from(vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ]);

    // Back-to-back windows covering the whole array: [0, 3), [3, 7), [7, len).
    let total = whole.len();
    let windows = [
        whole.slice(0, 3),
        whole.slice(3, 4),
        whole.slice(7, total - 7),
    ];

    let mut builder = GenericStringBuilder::<i32>::new();
    for window in &windows {
        builder.append_array(window);
    }

    assert_eq!(builder.finish(), whole);
}
596 | 813 | }
|
0 commit comments