Skip to content

Commit d7581ef

Browse files
authored
arrow-ipc: Default to not preserving dict IDs (#6788)
* arrow-ipc: Default to not preserving dict IDs * arrow-integration-testing: Adapt to using default settings Previously the integration tests forced preserving dict IDs in some places and used the default in others. This worked fine previously because preserving dict IDs used to be the default, but it isn't anymore.
1 parent c60ce14 commit d7581ef

File tree

3 files changed

+55
-16
lines changed

3 files changed

+55
-16
lines changed

arrow-integration-testing/src/flight_client_scenarios/integration_test.rs

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use arrow::{
2929
};
3030
use arrow_flight::{
3131
flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient,
32-
utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, Location, SchemaAsIpc, Ticket,
32+
utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, IpcMessage, Location, Ticket,
3333
};
3434
use futures::{channel::mpsc, sink::SinkExt, stream, StreamExt};
3535
use tonic::{Request, Streaming};
@@ -72,7 +72,19 @@ async fn upload_data(
7272
let (mut upload_tx, upload_rx) = mpsc::channel(10);
7373

7474
let options = arrow::ipc::writer::IpcWriteOptions::default();
75-
let mut schema_flight_data: FlightData = SchemaAsIpc::new(&schema, &options).into();
75+
let mut dict_tracker =
76+
writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id());
77+
let data_gen = writer::IpcDataGenerator::default();
78+
let data = IpcMessage(
79+
data_gen
80+
.schema_to_bytes_with_dictionary_tracker(&schema, &mut dict_tracker, &options)
81+
.ipc_message
82+
.into(),
83+
);
84+
let mut schema_flight_data = FlightData {
85+
data_header: data.0,
86+
..Default::default()
87+
};
7688
// arrow_flight::utils::flight_data_from_arrow_schema(&schema, &options);
7789
schema_flight_data.flight_descriptor = Some(descriptor.clone());
7890
upload_tx.send(schema_flight_data).await?;
@@ -82,7 +94,14 @@ async fn upload_data(
8294
if let Some((counter, first_batch)) = original_data_iter.next() {
8395
let metadata = counter.to_string().into_bytes();
8496
// Preload the first batch into the channel before starting the request
85-
send_batch(&mut upload_tx, &metadata, first_batch, &options).await?;
97+
send_batch(
98+
&mut upload_tx,
99+
&metadata,
100+
first_batch,
101+
&options,
102+
&mut dict_tracker,
103+
)
104+
.await?;
86105

87106
let outer = client.do_put(Request::new(upload_rx)).await?;
88107
let mut inner = outer.into_inner();
@@ -97,7 +116,14 @@ async fn upload_data(
97116
// Stream the rest of the batches
98117
for (counter, batch) in original_data_iter {
99118
let metadata = counter.to_string().into_bytes();
100-
send_batch(&mut upload_tx, &metadata, batch, &options).await?;
119+
send_batch(
120+
&mut upload_tx,
121+
&metadata,
122+
batch,
123+
&options,
124+
&mut dict_tracker,
125+
)
126+
.await?;
101127

102128
let r = inner
103129
.next()
@@ -124,12 +150,12 @@ async fn send_batch(
124150
metadata: &[u8],
125151
batch: &RecordBatch,
126152
options: &writer::IpcWriteOptions,
153+
dictionary_tracker: &mut writer::DictionaryTracker,
127154
) -> Result {
128155
let data_gen = writer::IpcDataGenerator::default();
129-
let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, true);
130156

131157
let (encoded_dictionaries, encoded_batch) = data_gen
132-
.encoded_batch(batch, &mut dictionary_tracker, options)
158+
.encoded_batch(batch, dictionary_tracker, options)
133159
.expect("DictionaryTracker configured above to not error on replacement");
134160

135161
let dictionary_flight_data: Vec<FlightData> =

arrow-integration-testing/src/flight_server_scenarios/integration_test.rs

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,18 +119,31 @@ impl FlightService for FlightServiceImpl {
119119
.ok_or_else(|| Status::not_found(format!("Could not find flight. {key}")))?;
120120

121121
let options = arrow::ipc::writer::IpcWriteOptions::default();
122+
let mut dictionary_tracker =
123+
writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id());
124+
let data_gen = writer::IpcDataGenerator::default();
125+
let data = IpcMessage(
126+
data_gen
127+
.schema_to_bytes_with_dictionary_tracker(
128+
&flight.schema,
129+
&mut dictionary_tracker,
130+
&options,
131+
)
132+
.ipc_message
133+
.into(),
134+
);
135+
let schema_flight_data = FlightData {
136+
data_header: data.0,
137+
..Default::default()
138+
};
122139

123-
let schema = std::iter::once(Ok(SchemaAsIpc::new(&flight.schema, &options).into()));
140+
let schema = std::iter::once(Ok(schema_flight_data));
124141

125142
let batches = flight
126143
.chunks
127144
.iter()
128145
.enumerate()
129146
.flat_map(|(counter, batch)| {
130-
let data_gen = writer::IpcDataGenerator::default();
131-
let mut dictionary_tracker =
132-
writer::DictionaryTracker::new_with_preserve_dict_id(false, true);
133-
134147
let (encoded_dictionaries, encoded_batch) = data_gen
135148
.encoded_batch(batch, &mut dictionary_tracker, &options)
136149
.expect("DictionaryTracker configured above to not error on replacement");

arrow-ipc/src/writer.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ pub struct IpcWriteOptions {
6464
/// Flag indicating whether the writer should preserve the dictionary IDs defined in the
6565
/// schema or generate unique dictionary IDs internally during encoding.
6666
///
67-
/// Defaults to `true`
67+
/// Defaults to `false`
6868
preserve_dict_id: bool,
6969
}
7070

@@ -113,7 +113,7 @@ impl IpcWriteOptions {
113113
write_legacy_ipc_format,
114114
metadata_version,
115115
batch_compression_type: None,
116-
preserve_dict_id: true,
116+
preserve_dict_id: false,
117117
}),
118118
crate::MetadataVersion::V5 => {
119119
if write_legacy_ipc_format {
@@ -126,7 +126,7 @@ impl IpcWriteOptions {
126126
write_legacy_ipc_format,
127127
metadata_version,
128128
batch_compression_type: None,
129-
preserve_dict_id: true,
129+
preserve_dict_id: false,
130130
})
131131
}
132132
}
@@ -162,7 +162,7 @@ impl Default for IpcWriteOptions {
162162
write_legacy_ipc_format: false,
163163
metadata_version: crate::MetadataVersion::V5,
164164
batch_compression_type: None,
165-
preserve_dict_id: true,
165+
preserve_dict_id: false,
166166
}
167167
}
168168
}
@@ -786,7 +786,7 @@ impl DictionaryTracker {
786786
written: HashMap::new(),
787787
dict_ids: Vec::new(),
788788
error_on_replacement,
789-
preserve_dict_id: true,
789+
preserve_dict_id: false,
790790
}
791791
}
792792

0 commit comments

Comments
 (0)