Skip to content

Commit b31ebcc

Browse files
authored
feat(puffin): Make Puffin APIs public (#1165)
## Which issue does this PR close?

Part of #744

## What changes are included in this PR?

- Make Puffin APIs public
- Turn the dead-code warning back on (it was disabled earlier to allow for private development)

## Are these changes tested?

N/A
1 parent 3018138 commit b31ebcc

File tree

6 files changed

+123
-37
lines changed

6 files changed

+123
-37
lines changed

crates/iceberg/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,4 @@ mod utils;
8888
pub mod writer;
8989

9090
mod delete_vector;
91-
mod puffin;
91+
pub mod puffin;

crates/iceberg/src/puffin/blob.rs

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,53 @@
1818
use std::collections::HashMap;
1919

2020
/// A serialized form of a "compact" Theta sketch produced by the Apache DataSketches library.
21-
pub(crate) const APACHE_DATASKETCHES_THETA_V1: &str = "apache-datasketches-theta-v1";
21+
pub const APACHE_DATASKETCHES_THETA_V1: &str = "apache-datasketches-theta-v1";
2222

2323
/// The blob
2424
#[derive(Debug, PartialEq, Clone)]
25-
pub(crate) struct Blob {
26-
/// See blob types: https://iceberg.apache.org/puffin-spec/#blob-types
25+
pub struct Blob {
2726
pub(crate) r#type: String,
28-
/// List of field IDs the blob was computed for; the order of items is used to compute sketches stored in the blob.
2927
pub(crate) fields: Vec<i32>,
30-
/// ID of the Iceberg table's snapshot the blob was computed from
3128
pub(crate) snapshot_id: i64,
32-
/// Sequence number of the Iceberg table's snapshot the blob was computed from
3329
pub(crate) sequence_number: i64,
34-
/// The uncompressed blob data
3530
pub(crate) data: Vec<u8>,
36-
/// Arbitrary meta-information about the blob
3731
pub(crate) properties: HashMap<String, String>,
3832
}
33+
34+
impl Blob {
35+
#[inline]
36+
/// See blob types: https://iceberg.apache.org/puffin-spec/#blob-types
37+
pub fn blob_type(&self) -> &str {
38+
&self.r#type
39+
}
40+
41+
#[inline]
42+
/// List of field IDs the blob was computed for; the order of items is used to compute sketches stored in the blob.
43+
pub fn fields(&self) -> &[i32] {
44+
&self.fields
45+
}
46+
47+
#[inline]
48+
/// ID of the Iceberg table's snapshot the blob was computed from
49+
pub fn snapshot_id(&self) -> i64 {
50+
self.snapshot_id
51+
}
52+
53+
#[inline]
54+
/// Sequence number of the Iceberg table's snapshot the blob was computed from
55+
pub fn sequence_number(&self) -> i64 {
56+
self.sequence_number
57+
}
58+
59+
#[inline]
60+
/// The uncompressed blob data
61+
pub fn data(&self) -> &[u8] {
62+
&self.data
63+
}
64+
65+
#[inline]
66+
/// Arbitrary meta-information about the blob
67+
pub fn properties(&self) -> &HashMap<String, String> {
68+
&self.properties
69+
}
70+
}

crates/iceberg/src/puffin/metadata.rs

Lines changed: 65 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,35 +26,77 @@ use crate::{Error, ErrorKind, Result};
2626

2727
/// Human-readable identification of the application writing the file, along with its version.
2828
/// Example: "Trino version 381"
29-
pub(crate) const CREATED_BY_PROPERTY: &str = "created-by";
29+
pub const CREATED_BY_PROPERTY: &str = "created-by";
3030

3131
/// Metadata about a blob.
3232
/// For more information, see: https://iceberg.apache.org/puffin-spec/#blobmetadata
3333
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
3434
#[serde(rename_all = "kebab-case")]
35-
pub(crate) struct BlobMetadata {
36-
/// See blob types: https://iceberg.apache.org/puffin-spec/#blob-types
35+
pub struct BlobMetadata {
3736
pub(crate) r#type: String,
38-
/// List of field IDs the blob was computed for; the order of items is used to compute sketches stored in the blob.
3937
pub(crate) fields: Vec<i32>,
40-
/// ID of the Iceberg table's snapshot the blob was computed from
4138
pub(crate) snapshot_id: i64,
42-
/// Sequence number of the Iceberg table's snapshot the blob was computed from
4339
pub(crate) sequence_number: i64,
44-
/// The offset in the file where the blob contents start
4540
pub(crate) offset: u64,
46-
/// The length of the blob stored in the file (after compression, if compressed)
4741
pub(crate) length: u64,
48-
/// The compression codec used to compress the data
4942
#[serde(skip_serializing_if = "CompressionCodec::is_none")]
5043
#[serde(default)]
5144
pub(crate) compression_codec: CompressionCodec,
52-
/// Arbitrary meta-information about the blob
5345
#[serde(skip_serializing_if = "HashMap::is_empty")]
5446
#[serde(default)]
5547
pub(crate) properties: HashMap<String, String>,
5648
}
5749

50+
impl BlobMetadata {
51+
#[inline]
52+
/// See blob types: https://iceberg.apache.org/puffin-spec/#blob-types
53+
pub fn blob_type(&self) -> &str {
54+
&self.r#type
55+
}
56+
57+
#[inline]
58+
/// List of field IDs the blob was computed for; the order of items is used to compute sketches stored in the blob.
59+
pub fn fields(&self) -> &[i32] {
60+
&self.fields
61+
}
62+
63+
#[inline]
64+
/// ID of the Iceberg table's snapshot the blob was computed from
65+
pub fn snapshot_id(&self) -> i64 {
66+
self.snapshot_id
67+
}
68+
69+
#[inline]
70+
/// Sequence number of the Iceberg table's snapshot the blob was computed from
71+
pub fn sequence_number(&self) -> i64 {
72+
self.sequence_number
73+
}
74+
75+
#[inline]
76+
/// The offset in the file where the blob contents start
77+
pub fn offset(&self) -> u64 {
78+
self.offset
79+
}
80+
81+
#[inline]
82+
/// The length of the blob stored in the file (after compression, if compressed)
83+
pub fn length(&self) -> u64 {
84+
self.length
85+
}
86+
87+
#[inline]
88+
/// The compression codec used to compress the data
89+
pub fn compression_codec(&self) -> CompressionCodec {
90+
self.compression_codec
91+
}
92+
93+
#[inline]
94+
/// Arbitrary meta-information about the blob
95+
pub fn properties(&self) -> &HashMap<String, String> {
96+
&self.properties
97+
}
98+
}
99+
58100
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
59101
pub(crate) enum Flag {
60102
FooterPayloadCompressed = 0,
@@ -91,10 +133,8 @@ impl Flag {
91133
/// Metadata about a puffin file.
92134
/// For more information, see: https://iceberg.apache.org/puffin-spec/#filemetadata
93135
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
94-
pub(crate) struct FileMetadata {
95-
/// Metadata about blobs in file
136+
pub struct FileMetadata {
96137
pub(crate) blobs: Vec<BlobMetadata>,
97-
/// Arbitrary meta-information, like writer identification/version.
98138
#[serde(skip_serializing_if = "HashMap::is_empty")]
99139
#[serde(default)]
100140
pub(crate) properties: HashMap<String, String>,
@@ -247,6 +287,18 @@ impl FileMetadata {
247287
FileMetadata::extract_footer_payload_as_str(&footer_bytes, footer_payload_length)?;
248288
FileMetadata::from_json_str(&footer_payload_str)
249289
}
290+
291+
#[inline]
292+
/// Metadata about blobs in file
293+
pub fn blobs(&self) -> &[BlobMetadata] {
294+
&self.blobs
295+
}
296+
297+
#[inline]
298+
/// Arbitrary meta-information, like writer identification/version.
299+
pub fn properties(&self) -> &HashMap<String, String> {
300+
&self.properties
301+
}
250302
}
251303

252304
#[cfg(test)]

crates/iceberg/src/puffin/mod.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,21 @@
1818
//! Iceberg Puffin implementation.
1919
2020
#![deny(missing_docs)]
21-
// Temporarily allowing this while crate is under active development
22-
#![allow(dead_code)]
2321

2422
mod blob;
23+
pub use blob::{Blob, APACHE_DATASKETCHES_THETA_V1};
24+
2525
mod compression;
26+
pub use compression::CompressionCodec;
27+
2628
mod metadata;
27-
#[cfg(feature = "tokio")]
29+
pub use metadata::{BlobMetadata, FileMetadata, CREATED_BY_PROPERTY};
30+
2831
mod reader;
32+
pub use reader::PuffinReader;
33+
2934
mod writer;
35+
pub use writer::PuffinWriter;
3036

3137
#[cfg(test)]
3238
mod test_utils;

crates/iceberg/src/puffin/reader.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,29 +23,29 @@ use crate::puffin::metadata::{BlobMetadata, FileMetadata};
2323
use crate::Result;
2424

2525
/// Puffin reader
26-
pub(crate) struct PuffinReader {
26+
pub struct PuffinReader {
2727
input_file: InputFile,
2828
file_metadata: OnceCell<FileMetadata>,
2929
}
3030

3131
impl PuffinReader {
3232
/// Returns a new Puffin reader
33-
pub(crate) fn new(input_file: InputFile) -> Self {
33+
pub fn new(input_file: InputFile) -> Self {
3434
Self {
3535
input_file,
3636
file_metadata: OnceCell::new(),
3737
}
3838
}
3939

4040
/// Returns file metadata
41-
pub(crate) async fn file_metadata(&self) -> Result<&FileMetadata> {
41+
pub async fn file_metadata(&self) -> Result<&FileMetadata> {
4242
self.file_metadata
4343
.get_or_try_init(|| FileMetadata::read(&self.input_file))
4444
.await
4545
}
4646

4747
/// Returns blob
48-
pub(crate) async fn blob(&self, blob_metadata: &BlobMetadata) -> Result<Blob> {
48+
pub async fn blob(&self, blob_metadata: &BlobMetadata) -> Result<Blob> {
4949
let file_read = self.input_file.reader().await?;
5050
let start = blob_metadata.offset;
5151
let end = start + blob_metadata.length;

crates/iceberg/src/puffin/writer.rs

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use crate::puffin::metadata::{BlobMetadata, FileMetadata, Flag};
2626
use crate::Result;
2727

2828
/// Puffin writer
29-
pub(crate) struct PuffinWriter {
29+
pub struct PuffinWriter {
3030
writer: Box<dyn FileWrite>,
3131
is_header_written: bool,
3232
num_bytes_written: u64,
@@ -38,7 +38,7 @@ pub(crate) struct PuffinWriter {
3838

3939
impl PuffinWriter {
4040
/// Returns a new Puffin writer
41-
pub(crate) async fn new(
41+
pub async fn new(
4242
output_file: &OutputFile,
4343
properties: HashMap<String, String>,
4444
compress_footer: bool,
@@ -63,11 +63,7 @@ impl PuffinWriter {
6363
}
6464

6565
/// Adds blob to Puffin file
66-
pub(crate) async fn add(
67-
&mut self,
68-
blob: Blob,
69-
compression_codec: CompressionCodec,
70-
) -> Result<()> {
66+
pub async fn add(&mut self, blob: Blob, compression_codec: CompressionCodec) -> Result<()> {
7167
self.write_header_once().await?;
7268

7369
let offset = self.num_bytes_written;
@@ -89,7 +85,7 @@ impl PuffinWriter {
8985
}
9086

9187
/// Finalizes the Puffin file
92-
pub(crate) async fn close(mut self) -> Result<()> {
88+
pub async fn close(mut self) -> Result<()> {
9389
self.write_header_once().await?;
9490
self.write_footer().await?;
9591
self.writer.close().await?;

0 commit comments

Comments
 (0)