
Commit 55909a8

[SessionContext] - Add read_csv/read_parquet/read_avro functions to SessionContext (#57)

1 parent 0ac714a

8 files changed: +230 additions, −10 deletions


.github/workflows/test.yaml (1 addition, 0 deletions)

The test job now initializes the git submodules so the arrow-testing and parquet-testing data files are available to the new tests:

```diff
@@ -101,6 +101,7 @@ jobs:
 
       - name: Run tests
         run: |
+          git submodule update --init
           source venv/bin/activate
           maturin develop --locked
           RUST_BACKTRACE=1 pytest -v .
```

.gitmodules (6 additions, 0 deletions)

```diff
@@ -0,0 +1,6 @@
+[submodule "testing"]
+	path = testing
+	url = https://github.com/apache/arrow-testing.git
+[submodule "parquet"]
+	path = parquet
+	url = https://github.com/apache/parquet-testing.git
```

Cargo.lock (109 additions, 8 deletions; generated file, diff not rendered)

Cargo.toml (1 addition, 1 deletion)

The `avro` feature is enabled on the datafusion crate; it gates `SessionContext::read_avro` upstream:

```diff
@@ -34,7 +34,7 @@ default = ["mimalloc"]
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
 rand = "0.7"
 pyo3 = { version = "~0.17.1", features = ["extension-module", "abi3", "abi3-py37"] }
-datafusion = { version = "^12.0.0", features = ["pyarrow"] }
+datafusion = { version = "^12.0.0", features = ["pyarrow", "avro"] }
 datafusion-expr = { version = "^12.0.0" }
 datafusion-common = { version = "^12.0.0", features = ["pyarrow"] }
 uuid = { version = "0.8", features = ["v4"] }
```

datafusion/tests/test_context.py (15 additions, 0 deletions)

Three tests cover the new readers, using data from the submodules added above:

```diff
@@ -179,3 +179,18 @@ def test_table_exist(ctx):
     ctx.register_dataset("t", dataset)
 
     assert ctx.table_exist("t") is True
+
+
+def test_read_csv(ctx):
+    csv_df = ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv")
+    csv_df.select(column("c1")).show()
+
+
+def test_read_parquet(ctx):
+    csv_df = ctx.read_parquet(path="parquet/data/alltypes_plain.parquet")
+    csv_df.show()
+
+
+def test_read_avro(ctx):
+    csv_df = ctx.read_avro(path="testing/data/avro/alltypes_plain.avro")
+    csv_df.show()
```
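The `ctx` fixture these tests receive is not part of the diff; a minimal sketch of what it is assumed to provide (a plain SessionContext, defined elsewhere in the suite):

```python
# Hypothetical reconstruction of the ctx fixture used by the tests above;
# not part of this commit.
import pytest

from datafusion import SessionContext


@pytest.fixture
def ctx():
    return SessionContext()
```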

parquet

Submodule parquet added at e13af11

src/context.rs (96 additions, 1 deletion)

The prelude import gains AvroReadOptions:

```diff
@@ -28,7 +28,7 @@ use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::datasource::datasource::TableProvider;
 use datafusion::datasource::MemTable;
 use datafusion::execution::context::{SessionConfig, SessionContext};
-use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
+use datafusion::prelude::{AvroReadOptions, CsvReadOptions, ParquetReadOptions};
 
 use crate::catalog::{PyCatalog, PyTable};
 use crate::dataframe::PyDataFrame;
```
The remaining additions are a single hunk at the end of `impl PySessionContext`. First, `read_csv`, which validates that the delimiter is a single byte before building `CsvReadOptions`:

```diff
@@ -264,4 +264,99 @@ impl PySessionContext {
     fn session_id(&self) -> PyResult<String> {
         Ok(self.ctx.session_id())
     }
+
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        schema = "None",
+        has_header = "true",
+        delimiter = "\",\"",
+        schema_infer_max_records = "1000",
+        file_extension = "\".csv\"",
+        table_partition_cols = "vec![]"
+    )]
+    fn read_csv(
+        &self,
+        path: PathBuf,
+        schema: Option<Schema>,
+        has_header: bool,
+        delimiter: &str,
+        schema_infer_max_records: usize,
+        file_extension: &str,
+        table_partition_cols: Vec<String>,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let path = path
+            .to_str()
+            .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?;
+
+        let delimiter = delimiter.as_bytes();
+        if delimiter.len() != 1 {
+            return Err(PyValueError::new_err(
+                "Delimiter must be a single character",
+            ));
+        };
+
+        let mut options = CsvReadOptions::new()
+            .has_header(has_header)
+            .delimiter(delimiter[0])
+            .schema_infer_max_records(schema_infer_max_records)
+            .file_extension(file_extension)
+            .table_partition_cols(table_partition_cols);
+        options.schema = schema.as_ref();
+
+        let result = self.ctx.read_csv(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+
+        Ok(df)
+    }
```
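From Python, the `#[args]` defaults become keyword arguments (`has_header=True`, `delimiter=","`, `schema_infer_max_records=1000`, `file_extension=".csv"`). A hedged usage sketch: the pipe-delimited file and its columns are hypothetical, and passing a pyarrow schema is assumed to convert through the crate's `pyarrow` feature:

```python
# Sketch only: "events.psv" and its columns are made up for illustration.
import pyarrow as pa

from datafusion import SessionContext

ctx = SessionContext()
df = ctx.read_csv(
    "events.psv",
    # An explicit schema skips inference (assumption: pyarrow.Schema is
    # accepted here via the pyarrow conversion feature).
    schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
    has_header=False,
    delimiter="|",            # validated in Rust: exactly one character
    file_extension=".psv",
)
```

A multi-character delimiter surfaces in Python as a ValueError ("Delimiter must be a single character").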
`read_parquet` follows the same pattern, forwarding its options to `ParquetReadOptions` (same hunk, continued):

```diff
+
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        parquet_pruning = "true",
+        file_extension = "\".parquet\"",
+        table_partition_cols = "vec![]",
+        skip_metadata = "true"
+    )]
+    fn read_parquet(
+        &self,
+        path: &str,
+        table_partition_cols: Vec<String>,
+        parquet_pruning: bool,
+        file_extension: &str,
+        skip_metadata: bool,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let mut options = ParquetReadOptions::default()
+            .table_partition_cols(table_partition_cols)
+            .parquet_pruning(parquet_pruning)
+            .skip_metadata(skip_metadata);
+        options.file_extension = file_extension;
+
+        let result = self.ctx.read_parquet(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+        Ok(df)
+    }
```
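A hedged sketch of the corresponding Python call, reading the parquet-testing file used in the tests with non-default options:

```python
# Minimal sketch; parameter names mirror the Rust signature above.
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.read_parquet(
    "parquet/data/alltypes_plain.parquet",
    parquet_pruning=False,  # disable row-group pruning for this scan
    skip_metadata=True,     # the default: drop file-level schema metadata
)
df.show()
```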
Finally, `read_avro`, which accepts an optional explicit schema (same hunk, end of the impl block):

```diff
+
+    #[allow(clippy::too_many_arguments)]
+    #[args(
+        schema = "None",
+        file_extension = "\".avro\"",
+        table_partition_cols = "vec![]"
+    )]
+    fn read_avro(
+        &self,
+        path: &str,
+        schema: Option<Schema>,
+        table_partition_cols: Vec<String>,
+        file_extension: &str,
+        py: Python,
+    ) -> PyResult<PyDataFrame> {
+        let mut options = AvroReadOptions::default().table_partition_cols(table_partition_cols);
+        options.file_extension = file_extension;
+        options.schema = schema.map(Arc::new);
+
+        let result = self.ctx.read_avro(path, options);
+        let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?);
+        Ok(df)
+    }
 }
```
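And the Avro counterpart, a minimal sketch assuming the arrow-testing submodule is checked out; `schema` and `file_extension` are left at their defaults:

```python
# Minimal sketch against the arrow-testing data file used in the tests.
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.read_avro("testing/data/avro/alltypes_plain.avro")
df.show()
```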

testing

Submodule testing added at 5bab2f2
