
Commit 55909a8

[SessionContext] - Add read_csv/read_parquet/read_avro functions to SessionContext (#57)

1 parent 0ac714a

8 files changed: +230 additions, −10 deletions


.github/workflows/test.yaml (1 addition, 0 deletions)

The test job now initializes the git submodules so the arrow-testing and parquet-testing data files are available to the new tests:

```diff
@@ -101,6 +101,7 @@ jobs:
 
       - name: Run tests
         run: |
+          git submodule update --init
           source venv/bin/activate
           maturin develop --locked
           RUST_BACKTRACE=1 pytest -v .
```

.gitmodules (6 additions, 0 deletions)

```diff
@@ -0,0 +1,6 @@
+[submodule "testing"]
+	path = testing
+	url = https://github.com/apache/arrow-testing.git
+[submodule "parquet"]
+	path = parquet
+	url = https://github.com/apache/parquet-testing.git
```

Cargo.lock (109 additions, 8 deletions; generated file, diff not rendered)

Cargo.toml (1 addition, 1 deletion)

The `avro` feature is enabled on the datafusion crate; it gates `SessionContext::read_avro` upstream:

```diff
@@ -34,7 +34,7 @@ default = ["mimalloc"]
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
 rand = "0.7"
 pyo3 = { version = "~0.17.1", features = ["extension-module", "abi3", "abi3-py37"] }
-datafusion = { version = "^12.0.0", features = ["pyarrow"] }
+datafusion = { version = "^12.0.0", features = ["pyarrow", "avro"] }
 datafusion-expr = { version = "^12.0.0" }
 datafusion-common = { version = "^12.0.0", features = ["pyarrow"] }
 uuid = { version = "0.8", features = ["v4"] }
```

datafusion/tests/test_context.py (15 additions, 0 deletions)

Three tests cover the new readers, using data from the submodules added above:

```diff
@@ -179,3 +179,18 @@ def test_table_exist(ctx):
     ctx.register_dataset("t", dataset)
 
     assert ctx.table_exist("t") is True
+
+
+def test_read_csv(ctx):
+    csv_df = ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv")
+    csv_df.select(column("c1")).show()
+
+
+def test_read_parquet(ctx):
+    csv_df = ctx.read_parquet(path="parquet/data/alltypes_plain.parquet")
+    csv_df.show()
+
+
+def test_read_avro(ctx):
+    csv_df = ctx.read_avro(path="testing/data/avro/alltypes_plain.avro")
+    csv_df.show()
```
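The `ctx` fixture these tests receive is not part of the diff; a minimal sketch of what it is assumed to provide (a plain SessionContext, defined elsewhere in the suite):

```python
# Hypothetical reconstruction of the ctx fixture used by the tests above;
# not part of this commit.
import pytest

from datafusion import SessionContext


@pytest.fixture
def ctx():
    return SessionContext()
```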

parquet

Submodule parquet added at e13af11

src/context.rs (96 additions, 1 deletion)

The prelude import gains AvroReadOptions:

```diff
@@ -28,7 +28,7 @@ use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::datasource::datasource::TableProvider;
 use datafusion::datasource::MemTable;
 use datafusion::execution::context::{SessionConfig, SessionContext};
-use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
+use datafusion::prelude::{AvroReadOptions, CsvReadOptions, ParquetReadOptions};
 
 use crate::catalog::{PyCatalog, PyTable};
 use crate::dataframe::PyDataFrame;
```
The remaining additions are a single hunk at the end of `impl PySessionContext`. First, `read_csv`, which validates that the delimiter is a single byte before building `CsvReadOptions`:

```diff
@@ -264,4 +264,99 @@ impl PySessionContext {
     fn session_id(&self) -> PyResult<String> {
         Ok(self.ctx.session_id())
     }
+
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        schema = "None",
+        has_header = "true",
+        delimiter = "\",\"",
+        schema_infer_max_records = "1000",
+        file_extension = "\".csv\"",
+        table_partition_cols = "vec![]"
+    )]
+    fn read_csv(
+        &self,
+        path: PathBuf,
+        schema: Option<Schema>,
+        has_header: bool,
+        delimiter: &str,
+        schema_infer_max_records: usize,
+        file_extension: &str,
+        table_partition_cols: Vec<String>,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let path = path
+            .to_str()
+            .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?;
+
+        let delimiter = delimiter.as_bytes();
+        if delimiter.len() != 1 {
+            return Err(PyValueError::new_err(
+                "Delimiter must be a single character",
+            ));
+        };
+
+        let mut options = CsvReadOptions::new()
+            .has_header(has_header)
+            .delimiter(delimiter[0])
+            .schema_infer_max_records(schema_infer_max_records)
+            .file_extension(file_extension)
+            .table_partition_cols(table_partition_cols);
+        options.schema = schema.as_ref();
+
+        let result = self.ctx.read_csv(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+
+        Ok(df)
+    }
```
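From Python, the `#[args]` defaults become keyword arguments (`has_header=True`, `delimiter=","`, `schema_infer_max_records=1000`, `file_extension=".csv"`). A hedged usage sketch: the pipe-delimited file and its columns are hypothetical, and passing a pyarrow schema is assumed to convert through the crate's `pyarrow` feature:

```python
# Sketch only: "events.psv" and its columns are made up for illustration.
import pyarrow as pa

from datafusion import SessionContext

ctx = SessionContext()
df = ctx.read_csv(
    "events.psv",
    # An explicit schema skips inference (assumption: pyarrow.Schema is
    # accepted here via the pyarrow conversion feature).
    schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
    has_header=False,
    delimiter="|",            # validated in Rust: exactly one character
    file_extension=".psv",
)
```

A multi-character delimiter surfaces in Python as a ValueError ("Delimiter must be a single character").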
`read_parquet` follows the same pattern, forwarding its options to `ParquetReadOptions` (same hunk, continued):

```diff
+
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        parquet_pruning = "true",
+        file_extension = "\".parquet\"",
+        table_partition_cols = "vec![]",
+        skip_metadata = "true"
+    )]
+    fn read_parquet(
+        &self,
+        path: &str,
+        table_partition_cols: Vec<String>,
+        parquet_pruning: bool,
+        file_extension: &str,
+        skip_metadata: bool,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let mut options = ParquetReadOptions::default()
+            .table_partition_cols(table_partition_cols)
+            .parquet_pruning(parquet_pruning)
+            .skip_metadata(skip_metadata);
+        options.file_extension = file_extension;
+
+        let result = self.ctx.read_parquet(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+        Ok(df)
+    }
```
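A hedged sketch of the corresponding Python call, reading the parquet-testing file used in the tests with non-default options:

```python
# Minimal sketch; parameter names mirror the Rust signature above.
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.read_parquet(
    "parquet/data/alltypes_plain.parquet",
    parquet_pruning=False,  # disable row-group pruning for this scan
    skip_metadata=True,     # the default: drop file-level schema metadata
)
df.show()
```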
Finally, `read_avro`, which accepts an optional explicit schema (same hunk, end of the impl block):

```diff
+
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        schema = "None",
+        file_extension = "\".avro\"",
+        table_partition_cols = "vec![]"
+    )]
+    fn read_avro(
+        &self,
+        path: &str,
+        schema: Option<Schema>,
+        table_partition_cols: Vec<String>,
+        file_extension: &str,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let mut options = AvroReadOptions::default().table_partition_cols(table_partition_cols);
+        options.file_extension = file_extension;
+        options.schema = schema.map(Arc::new);
+
+        let result = self.ctx.read_avro(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+        Ok(df)
+    }
 }
```
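And the Avro counterpart, a minimal sketch assuming the arrow-testing submodule is checked out; `schema` and `file_extension` are left at their defaults:

```python
# Minimal sketch against the arrow-testing data file used in the tests.
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.read_avro("testing/data/avro/alltypes_plain.avro")
df.show()
```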

testing

Submodule testing added at 5bab2f2
