Skip to content

Postgres add support for more types #71

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ test-python: setup-python
cd connectorx-python && poetry run pytest connectorx/tests -v -s

seed-db:
psql $POSTGRES_URL -c "DROP TABLE IF EXISTS test_table;"
psql $POSTGRES_URL -c "DROP TABLE IF EXISTS test_str;"
psql $POSTGRES_URL -f scripts/postgres.sql

# benches
Expand Down
2 changes: 2 additions & 0 deletions connectorx-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ numpy = "0.13"
pyo3 = {version = "0.13", default-features = false, features = ["macros"]}
pyo3-built = "0.4"
rust_decimal = {version = "1", features = ["db-postgres"]}
serde_json = "1"
sqlparser = "0.8.0"
thiserror = "1"
tokio = {version = "1", features = ["rt-multi-thread", "io-util"]}
uuid = "0.8"

[build-dependencies]
built = {version = "0.4", features = ["chrono"]}
Expand Down
92 changes: 92 additions & 0 deletions connectorx-python/connectorx/tests/test_read_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,95 @@ def test_read_sql_on_utf8(postgres_url: str) -> None:
},
)
assert_frame_equal(df, expected, check_names=True)


def test_types_binary(postgres_url: str) -> None:
    """Read int16/char/uuid/time/json/jsonb/bytea/enum columns over the
    default (binary) protocol and compare against hand-built expectations."""
    query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum FROM test_types"
    df = read_sql(postgres_url, query)

    # Expected values for the four seeded rows of test_types.
    uuids = [
        "86b494cc-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49b84-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49c42-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49cce-96b2-11eb-9298-3e22fbb9fe9d",
    ]
    jsons = [
        '{"customer":"John Doe","items":{"product":"Beer","qty":6}}',
        '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}',
        '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}',
        '{"customer":"Mary Clark","items":{"product":"Toy Train","qty":2}}',
    ]
    jsonbs = [
        '{"qty":6,"product":"Beer"}',
        '{"qty":24,"product":"Diaper"}',
        '{"qty":1,"product":"Toy Car"}',
        '{"qty":2,"product":"Toy Train"}',
    ]
    # bytea fixtures cover plain ASCII, multi-byte UTF-8 and an emoji sequence.
    byteas = [
        b'test',
        b'\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5',
        b'123bhaf4',
        b'\xf0\x9f\x98\x9c',
    ]
    expected = pd.DataFrame(
        index=range(4),
        data={
            "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"),
            "test_char": pd.Series(["a", "b", "c", "d"], dtype="object"),
            "test_uuid": pd.Series(uuids, dtype="object"),
            "test_time": pd.Series(["08:12:40", "10:03:00", "23:00:10", "18:30:00"], dtype="object"),
            "test_json": pd.Series(jsons, dtype="object"),
            "test_jsonb": pd.Series(jsonbs, dtype="object"),
            "test_enum": pd.Series(['happy', 'very happy', 'ecstatic', 'ecstatic'], dtype="object"),
            "test_bytea": pd.Series(byteas, dtype="object"),
        },
    )
    assert_frame_equal(df, expected, check_names=True)


def test_types_csv(postgres_url: str) -> None:
    """Same column coverage as the binary test but over the CSV protocol;
    the enum column is cast to text in SQL for the CSV code path."""
    query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text FROM test_types"
    df = read_sql(postgres_url, query, protocol="csv")

    # Expected values for the four seeded rows of test_types.
    uuids = [
        "86b494cc-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49b84-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49c42-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49cce-96b2-11eb-9298-3e22fbb9fe9d",
    ]
    jsons = [
        '{"customer":"John Doe","items":{"product":"Beer","qty":6}}',
        '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}',
        '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}',
        '{"customer":"Mary Clark","items":{"product":"Toy Train","qty":2}}',
    ]
    jsonbs = [
        '{"qty":6,"product":"Beer"}',
        '{"qty":24,"product":"Diaper"}',
        '{"qty":1,"product":"Toy Car"}',
        '{"qty":2,"product":"Toy Train"}',
    ]
    # bytea fixtures cover plain ASCII, multi-byte UTF-8 and an emoji sequence.
    byteas = [
        b'test',
        b'\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5',
        b'123bhaf4',
        b'\xf0\x9f\x98\x9c',
    ]
    expected = pd.DataFrame(
        index=range(4),
        data={
            "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"),
            "test_char": pd.Series(["a", "b", "c", "d"], dtype="object"),
            "test_uuid": pd.Series(uuids, dtype="object"),
            "test_time": pd.Series(["08:12:40", "10:03:00", "23:00:10", "18:30:00"], dtype="object"),
            "test_json": pd.Series(jsons, dtype="object"),
            "test_jsonb": pd.Series(jsonbs, dtype="object"),
            "test_enum": pd.Series(['happy', 'very happy', 'ecstatic', 'ecstatic'], dtype="object"),
            "test_bytea": pd.Series(byteas, dtype="object"),
        },
    )
    assert_frame_equal(df, expected, check_names=True)
19 changes: 16 additions & 3 deletions connectorx-python/src/pandas/destination.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::pandas_columns::{
BooleanBlock, DateTimeBlock, Float64Block, HasPandasColumn, Int64Block, PandasColumn,
PandasColumnObject, StringBlock,
BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn, Int64Block,
PandasColumn, PandasColumnObject, StringBlock,
};
use super::types::{PandasDType, PandasTypeSystem};
use anyhow::anyhow;
Expand Down Expand Up @@ -156,7 +156,9 @@ impl<'a> Destination for PandasDestination<'a> {
.collect()
}
}
PandasTypeSystem::String(_) => {
PandasTypeSystem::String(_)
| PandasTypeSystem::Str(_)
| PandasTypeSystem::Char(_) => {
let block = StringBlock::extract(buf).map_err(|e| anyhow!(e))?;
let cols = block.split()?;
for (&cid, col) in cids.iter().zip_eq(cols) {
Expand All @@ -167,6 +169,17 @@ impl<'a> Destination for PandasDestination<'a> {
.collect()
}
}
PandasTypeSystem::Bytes(_) => {
let block = BytesBlock::extract(buf).map_err(|e| anyhow!(e))?;
let cols = block.split()?;
for (&cid, col) in cids.iter().zip_eq(cols) {
partitioned_columns[cid] = col
.partition(&counts)
.into_iter()
.map(|c| Box::new(c) as _)
.collect()
}
}
PandasTypeSystem::DateTime(_) => {
let block = DateTimeBlock::extract(buf).map_err(|e| anyhow!(e))?;
let cols = block.split()?;
Expand Down
187 changes: 187 additions & 0 deletions connectorx-python/src/pandas/pandas_columns/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject};
use anyhow::anyhow;
use connectorx::ConnectorAgentError;
use fehler::throws;
use ndarray::{ArrayViewMut2, Axis, Ix2};
use numpy::{npyffi::NPY_TYPES, Element, PyArray, PyArrayDescr};
use pyo3::{FromPyObject, Py, PyAny, PyResult, Python};
use std::any::TypeId;
use std::sync::{Arc, Mutex};

/// Newtype wrapper over a Python `bytes` object so it can be stored as an
/// element of a numpy `object` array.
#[derive(Clone)]
#[repr(transparent)] // same layout as the inner `Py<...>` pointer
pub struct PyBytes(Py<pyo3::types::PyBytes>);

// In order to put it into a numpy array
impl Element for PyBytes {
    // Elements are generic Python objects (NPY_OBJECT), not a fixed-width dtype.
    const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
    fn is_same_type(dtype: &PyArrayDescr) -> bool {
        // NOTE(review): dereferences the raw dtype descriptor to read its type
        // number; assumes the pointer handed over by numpy is valid.
        unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
    }
}

/// A mutable 2-D view over a pre-allocated numpy `object` array holding the
/// bytes columns of one destination block.
pub struct BytesBlock<'a> {
    data: ArrayViewMut2<'a, PyBytes>,
    // Shared lock serializing Python object allocation across the columns
    // split off from this block.
    mutex: Arc<Mutex<()>>,
    // Per-column write-buffer size, in megabytes.
    buf_size_mb: usize,
}

impl<'a> FromPyObject<'a> for BytesBlock<'a> {
    /// Extract a mutable 2-D view from a numpy array of dtype `object`.
    fn extract(ob: &'a PyAny) -> PyResult<Self> {
        // Reject arrays whose dtype is not `object` up front.
        check_dtype(ob, "object")?;
        let array = ob.downcast::<PyArray<PyBytes, Ix2>>()?;
        // NOTE(review): `as_array_mut` assumes no other live view aliases this
        // array for the lifetime of the block — caller must guarantee this.
        let data = unsafe { array.as_array_mut() };
        Ok(BytesBlock {
            data,
            mutex: Arc::new(Mutex::new(())), // allocate the lock here since only BytesBlock needs to acquire the GIL for now
            buf_size_mb: 16, // in MB
        })
    }
}

impl<'a> BytesBlock<'a> {
    /// Split the 2-D block into one `BytesColumn` per array row (each array
    /// row stores the data of one destination column).
    #[throws(ConnectorAgentError)]
    pub fn split(self) -> Vec<BytesColumn<'a>> {
        let mut ret = vec![];
        let mut view = self.data;

        // One column per array row, so the row count of each destination
        // column is the array's column count.
        let nrows = view.ncols();
        // Convert MB to bytes with an explicit `(1 << 20)`. The previous
        // expression `self.buf_size_mb * 2 << 20 * 11 / 10` parsed as
        // `(buf_size_mb * 2) << 22` because `*` binds tighter than `<<` in
        // Rust, allocating ~8x the intended capacity (128 MiB instead of
        // ~17.6 MiB) and doubling the flush threshold.
        let buf_size = self.buf_size_mb * (1 << 20);
        while view.nrows() > 0 {
            let (col, rest) = view.split_at(Axis(0), 1);
            view = rest;
            ret.push(BytesColumn {
                data: col
                    .into_shape(nrows)?
                    .into_slice()
                    .ok_or_else(|| anyhow!("get None for splitted Bytes data"))?,
                next_write: 0,
                bytes_lengths: vec![],
                // allocate a little bit (10%) more memory to avoid Vec growth
                // before `try_flush` fires
                bytes_buf: Vec::with_capacity(buf_size * 11 / 10),
                buf_size,
                mutex: self.mutex.clone(),
            })
        }
        ret
    }
}

/// One destination column: a mutable slice of the block's object array plus a
/// staging buffer that accumulates raw bytes between flushes.
pub struct BytesColumn<'a> {
    // Destination cells for this column.
    data: &'a mut [PyBytes],
    // Index of the first cell the next flush will fill.
    next_write: usize,
    // Raw bytes of all buffered values, concatenated back to back.
    bytes_buf: Vec<u8>,
    // Length of each buffered value; 0 marks an entry that flush skips
    // (written for nulls — empty values would look the same, see flush).
    bytes_lengths: Vec<usize>,
    // Flush threshold for `bytes_buf`, in bytes.
    buf_size: usize,
    // Shared lock guarding Python object allocation.
    mutex: Arc<Mutex<()>>,
}

impl<'a> PandasColumnObject for BytesColumn<'a> {
    /// Accept both the non-null and the nullable byte-slice type ids.
    fn typecheck(&self, id: TypeId) -> bool {
        let bytes_id = TypeId::of::<&'static [u8]>();
        let opt_bytes_id = TypeId::of::<Option<&'static [u8]>>();
        id == bytes_id || id == opt_bytes_id
    }

    /// Number of destination cells backing this column.
    fn len(&self) -> usize {
        self.data.len()
    }

    fn typename(&self) -> &'static str {
        std::any::type_name::<&'static [u8]>()
    }

    /// Push any still-buffered values into the destination before the column
    /// is handed back.
    #[throws(ConnectorAgentError)]
    fn finalize(&mut self) {
        self.flush()?;
    }
}

impl<'a> PandasColumn<Vec<u8>> for BytesColumn<'a> {
    /// Stage one non-null bytes value and flush if the buffer crossed its limit.
    #[throws(ConnectorAgentError)]
    fn write(&mut self, val: Vec<u8>) {
        self.bytes_buf.extend_from_slice(val.as_slice());
        self.bytes_lengths.push(val.len());
        self.try_flush()?;
    }
}

impl<'a> PandasColumn<Option<Vec<u8>>> for BytesColumn<'a> {
    /// Stage an optional bytes value. A `None` is recorded as a zero-length
    /// entry, which `flush` skips, leaving the destination cell untouched.
    #[throws(ConnectorAgentError)]
    fn write(&mut self, val: Option<Vec<u8>>) {
        if let Some(bytes) = val {
            self.bytes_lengths.push(bytes.len());
            self.bytes_buf.extend_from_slice(&bytes);
            self.try_flush()?;
        } else {
            self.bytes_lengths.push(0);
        }
    }
}

// Map the owned bytes value types produced by sources to the pandas column
// implementation that stores them.
impl HasPandasColumn for Vec<u8> {
    type PandasColumn<'a> = BytesColumn<'a>;
}

impl HasPandasColumn for Option<Vec<u8>> {
    type PandasColumn<'a> = BytesColumn<'a>;
}

impl<'a> BytesColumn<'a> {
    /// Split this column into consecutive partitions of the sizes given in
    /// `counts`, so multiple workers can fill disjoint row ranges.
    pub fn partition(self, counts: &[usize]) -> Vec<BytesColumn<'a>> {
        let mut partitions = vec![];
        let mut data = self.data;

        for &c in counts {
            let (splitted_data, rest) = data.split_at_mut(c);
            data = rest;

            partitions.push(BytesColumn {
                data: splitted_data,
                next_write: 0,
                bytes_lengths: vec![],
                bytes_buf: Vec::with_capacity(self.buf_size),
                buf_size: self.buf_size,
                mutex: self.mutex.clone(),
            });
        }

        partitions
    }

    /// Materialize every buffered value as a Python `bytes` object in the
    /// destination slice, then reset the staging buffers for the next batch.
    #[throws(ConnectorAgentError)]
    pub fn flush(&mut self) {
        let nstrings = self.bytes_lengths.len();

        if nstrings > 0 {
            // NOTE(review): assumes the calling thread actually holds the GIL;
            // `assume_gil_acquired` performs no runtime check.
            let py = unsafe { Python::assume_gil_acquired() };

            {
                // allocation in python is not thread safe
                let _guard = self
                    .mutex
                    .lock()
                    .map_err(|e| anyhow!("mutex poisoned {}", e))?;
                let mut start = 0;
                for (i, &len) in self.bytes_lengths.iter().enumerate() {
                    let end = start + len;
                    // Zero-length entries mark nulls and leave the cell
                    // untouched. NOTE(review): an empty non-null value is
                    // indistinguishable from a null here — TODO confirm that
                    // empty bytea is not expected.
                    if len != 0 {
                        unsafe {
                            // allocate and write in the same time
                            *self.data.get_unchecked_mut(self.next_write + i) = PyBytes(
                                pyo3::types::PyBytes::new(py, &self.bytes_buf[start..end]).into(),
                            );
                        };
                    }
                    start = end;
                }
            }

            self.bytes_buf.truncate(0);
            // Bug fix: the lengths must be cleared along with the byte buffer.
            // Previously a second flush re-read the stale lengths against an
            // empty `bytes_buf` (panicking on the `start..end` slice) and
            // double-advanced `next_write`.
            self.bytes_lengths.truncate(0);
            self.next_write += nstrings;
        }
    }

    /// Flush once the staging buffer has reached its configured threshold.
    #[throws(ConnectorAgentError)]
    pub fn try_flush(&mut self) {
        if self.bytes_buf.len() >= self.buf_size {
            self.flush()?;
        }
    }
}
2 changes: 2 additions & 0 deletions connectorx-python/src/pandas/pandas_columns/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
mod boolean;
mod bytes;
mod datetime;
mod float64;
mod int64;
mod string;
// TODO: use macro for integers

pub use crate::pandas::pandas_columns::bytes::{BytesBlock, BytesColumn};
pub use boolean::{BooleanBlock, BooleanColumn};
use connectorx::Result;
pub use datetime::{DateTimeBlock, DateTimeColumn};
Expand Down
Loading