Skip to content

Postgres add support for more types #71

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ test-python: setup-python
cd connectorx-python && poetry run pytest connectorx/tests -v -s

seed-db:
psql $POSTGRES_URL -c "DROP TABLE IF EXISTS test_table;"
psql $POSTGRES_URL -c "DROP TABLE IF EXISTS test_str;"
psql $POSTGRES_URL -f scripts/postgres.sql

# benches
Expand Down
2 changes: 2 additions & 0 deletions connectorx-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ numpy = "0.13"
pyo3 = {version = "0.13", default-features = false, features = ["macros"]}
pyo3-built = "0.4"
rust_decimal = {version = "1", features = ["db-postgres"]}
serde_json = "1"
sqlparser = "0.8.0"
thiserror = "1"
tokio = {version = "1", features = ["rt-multi-thread", "io-util"]}
uuid = "0.8"

[build-dependencies]
built = {version = "0.4", features = ["chrono"]}
Expand Down
92 changes: 92 additions & 0 deletions connectorx-python/connectorx/tests/test_read_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,95 @@ def test_read_sql_on_utf8(postgres_url: str) -> None:
},
)
assert_frame_equal(df, expected, check_names=True)


def test_types_binary(postgres_url: str) -> None:
    """Read int16/char/uuid/time/json/jsonb/bytea/enum columns over the
    default (binary) protocol and compare against hand-built expectations."""
    query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum FROM test_types"
    df = read_sql(postgres_url, query)

    # Expected values for the four seeded rows of test_types.
    uuids = [
        "86b494cc-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49b84-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49c42-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49cce-96b2-11eb-9298-3e22fbb9fe9d",
    ]
    jsons = [
        '{"customer":"John Doe","items":{"product":"Beer","qty":6}}',
        '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}',
        '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}',
        '{"customer":"Mary Clark","items":{"product":"Toy Train","qty":2}}',
    ]
    jsonbs = [
        '{"qty":6,"product":"Beer"}',
        '{"qty":24,"product":"Diaper"}',
        '{"qty":1,"product":"Toy Car"}',
        '{"qty":2,"product":"Toy Train"}',
    ]
    # bytea fixtures cover plain ASCII, multi-byte UTF-8 and an emoji sequence.
    byteas = [
        b'test',
        b'\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5',
        b'123bhaf4',
        b'\xf0\x9f\x98\x9c',
    ]
    expected = pd.DataFrame(
        index=range(4),
        data={
            "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"),
            "test_char": pd.Series(["a", "b", "c", "d"], dtype="object"),
            "test_uuid": pd.Series(uuids, dtype="object"),
            "test_time": pd.Series(["08:12:40", "10:03:00", "23:00:10", "18:30:00"], dtype="object"),
            "test_json": pd.Series(jsons, dtype="object"),
            "test_jsonb": pd.Series(jsonbs, dtype="object"),
            "test_enum": pd.Series(['happy', 'very happy', 'ecstatic', 'ecstatic'], dtype="object"),
            "test_bytea": pd.Series(byteas, dtype="object"),
        },
    )
    assert_frame_equal(df, expected, check_names=True)


def test_types_csv(postgres_url: str) -> None:
    """Same column coverage as the binary test but over the CSV protocol;
    the enum column is cast to text in SQL for the CSV code path."""
    query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text FROM test_types"
    df = read_sql(postgres_url, query, protocol="csv")

    # Expected values for the four seeded rows of test_types.
    uuids = [
        "86b494cc-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49b84-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49c42-96b2-11eb-9298-3e22fbb9fe9d",
        "86b49cce-96b2-11eb-9298-3e22fbb9fe9d",
    ]
    jsons = [
        '{"customer":"John Doe","items":{"product":"Beer","qty":6}}',
        '{"customer":"Lily Bush","items":{"product":"Diaper","qty":24}}',
        '{"customer":"Josh William","items":{"product":"Toy Car","qty":1}}',
        '{"customer":"Mary Clark","items":{"product":"Toy Train","qty":2}}',
    ]
    jsonbs = [
        '{"qty":6,"product":"Beer"}',
        '{"qty":24,"product":"Diaper"}',
        '{"qty":1,"product":"Toy Car"}',
        '{"qty":2,"product":"Toy Train"}',
    ]
    # bytea fixtures cover plain ASCII, multi-byte UTF-8 and an emoji sequence.
    byteas = [
        b'test',
        b'\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xcc\x81\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5',
        b'123bhaf4',
        b'\xf0\x9f\x98\x9c',
    ]
    expected = pd.DataFrame(
        index=range(4),
        data={
            "test_int16": pd.Series([0, 1, 2, 3], dtype="Int64"),
            "test_char": pd.Series(["a", "b", "c", "d"], dtype="object"),
            "test_uuid": pd.Series(uuids, dtype="object"),
            "test_time": pd.Series(["08:12:40", "10:03:00", "23:00:10", "18:30:00"], dtype="object"),
            "test_json": pd.Series(jsons, dtype="object"),
            "test_jsonb": pd.Series(jsonbs, dtype="object"),
            "test_enum": pd.Series(['happy', 'very happy', 'ecstatic', 'ecstatic'], dtype="object"),
            "test_bytea": pd.Series(byteas, dtype="object"),
        },
    )
    assert_frame_equal(df, expected, check_names=True)
19 changes: 16 additions & 3 deletions connectorx-python/src/pandas/destination.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::pandas_columns::{
BooleanBlock, DateTimeBlock, Float64Block, HasPandasColumn, Int64Block, PandasColumn,
PandasColumnObject, StringBlock,
BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn, Int64Block,
PandasColumn, PandasColumnObject, StringBlock,
};
use super::types::{PandasDType, PandasTypeSystem};
use anyhow::anyhow;
Expand Down Expand Up @@ -156,7 +156,9 @@ impl<'a> Destination for PandasDestination<'a> {
.collect()
}
}
PandasTypeSystem::String(_) => {
PandasTypeSystem::String(_)
| PandasTypeSystem::Str(_)
| PandasTypeSystem::Char(_) => {
let block = StringBlock::extract(buf).map_err(|e| anyhow!(e))?;
let cols = block.split()?;
for (&cid, col) in cids.iter().zip_eq(cols) {
Expand All @@ -167,6 +169,17 @@ impl<'a> Destination for PandasDestination<'a> {
.collect()
}
}
PandasTypeSystem::Bytes(_) => {
let block = BytesBlock::extract(buf).map_err(|e| anyhow!(e))?;
let cols = block.split()?;
for (&cid, col) in cids.iter().zip_eq(cols) {
partitioned_columns[cid] = col
.partition(&counts)
.into_iter()
.map(|c| Box::new(c) as _)
.collect()
}
}
PandasTypeSystem::DateTime(_) => {
let block = DateTimeBlock::extract(buf).map_err(|e| anyhow!(e))?;
let cols = block.split()?;
Expand Down
187 changes: 187 additions & 0 deletions connectorx-python/src/pandas/pandas_columns/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject};
use anyhow::anyhow;
use connectorx::ConnectorAgentError;
use fehler::throws;
use ndarray::{ArrayViewMut2, Axis, Ix2};
use numpy::{npyffi::NPY_TYPES, Element, PyArray, PyArrayDescr};
use pyo3::{FromPyObject, Py, PyAny, PyResult, Python};
use std::any::TypeId;
use std::sync::{Arc, Mutex};

/// Newtype wrapper over a Python `bytes` object so it can be stored as an
/// element of a numpy `object` array.
#[derive(Clone)]
#[repr(transparent)] // same layout as the inner `Py<...>` pointer
pub struct PyBytes(Py<pyo3::types::PyBytes>);

// In order to put it into a numpy array
impl Element for PyBytes {
    // Elements are generic Python objects (NPY_OBJECT), not a fixed-width dtype.
    const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
    fn is_same_type(dtype: &PyArrayDescr) -> bool {
        // NOTE(review): dereferences the raw dtype descriptor to read its type
        // number; assumes the pointer handed over by numpy is valid.
        unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
    }
}

/// A mutable 2-D view over a pre-allocated numpy `object` array holding the
/// bytes columns of one destination block.
pub struct BytesBlock<'a> {
    data: ArrayViewMut2<'a, PyBytes>,
    // Shared lock serializing Python object allocation across the columns
    // split off from this block.
    mutex: Arc<Mutex<()>>,
    // Per-column write-buffer size, in megabytes.
    buf_size_mb: usize,
}

impl<'a> FromPyObject<'a> for BytesBlock<'a> {
    /// Extract a mutable 2-D view from a numpy array of dtype `object`.
    fn extract(ob: &'a PyAny) -> PyResult<Self> {
        // Reject arrays whose dtype is not `object` up front.
        check_dtype(ob, "object")?;
        let array = ob.downcast::<PyArray<PyBytes, Ix2>>()?;
        // NOTE(review): `as_array_mut` assumes no other live view aliases this
        // array for the lifetime of the block — caller must guarantee this.
        let data = unsafe { array.as_array_mut() };
        Ok(BytesBlock {
            data,
            mutex: Arc::new(Mutex::new(())), // allocate the lock here since only BytesBlock needs to acquire the GIL for now
            buf_size_mb: 16, // in MB
        })
    }
}

impl<'a> BytesBlock<'a> {
    /// Split the 2-D block into one `BytesColumn` per array row (each array
    /// row stores the data of one destination column).
    #[throws(ConnectorAgentError)]
    pub fn split(self) -> Vec<BytesColumn<'a>> {
        let mut ret = vec![];
        let mut view = self.data;

        // One column per array row, so the row count of each destination
        // column is the array's column count.
        let nrows = view.ncols();
        // Convert MB to bytes with an explicit `(1 << 20)`. The previous
        // expression `self.buf_size_mb * 2 << 20 * 11 / 10` parsed as
        // `(buf_size_mb * 2) << 22` because `*` binds tighter than `<<` in
        // Rust, allocating ~8x the intended capacity (128 MiB instead of
        // ~17.6 MiB) and doubling the flush threshold.
        let buf_size = self.buf_size_mb * (1 << 20);
        while view.nrows() > 0 {
            let (col, rest) = view.split_at(Axis(0), 1);
            view = rest;
            ret.push(BytesColumn {
                data: col
                    .into_shape(nrows)?
                    .into_slice()
                    .ok_or_else(|| anyhow!("get None for splitted Bytes data"))?,
                next_write: 0,
                bytes_lengths: vec![],
                // allocate a little bit (10%) more memory to avoid Vec growth
                // before `try_flush` fires
                bytes_buf: Vec::with_capacity(buf_size * 11 / 10),
                buf_size,
                mutex: self.mutex.clone(),
            })
        }
        ret
    }
}

/// One destination column: a mutable slice of the block's object array plus a
/// staging buffer that accumulates raw bytes between flushes.
pub struct BytesColumn<'a> {
    // Destination cells for this column.
    data: &'a mut [PyBytes],
    // Index of the first cell the next flush will fill.
    next_write: usize,
    // Raw bytes of all buffered values, concatenated back to back.
    bytes_buf: Vec<u8>,
    // Length of each buffered value; 0 marks an entry that flush skips
    // (written for nulls — empty values would look the same, see flush).
    bytes_lengths: Vec<usize>,
    // Flush threshold for `bytes_buf`, in bytes.
    buf_size: usize,
    // Shared lock guarding Python object allocation.
    mutex: Arc<Mutex<()>>,
}

impl<'a> PandasColumnObject for BytesColumn<'a> {
    /// Accept both the non-null and the nullable byte-slice type ids.
    fn typecheck(&self, id: TypeId) -> bool {
        let bytes_id = TypeId::of::<&'static [u8]>();
        let opt_bytes_id = TypeId::of::<Option<&'static [u8]>>();
        id == bytes_id || id == opt_bytes_id
    }

    /// Number of destination cells backing this column.
    fn len(&self) -> usize {
        self.data.len()
    }

    fn typename(&self) -> &'static str {
        std::any::type_name::<&'static [u8]>()
    }

    /// Push any still-buffered values into the destination before the column
    /// is handed back.
    #[throws(ConnectorAgentError)]
    fn finalize(&mut self) {
        self.flush()?;
    }
}

impl<'a> PandasColumn<Vec<u8>> for BytesColumn<'a> {
    /// Stage one non-null bytes value and flush if the buffer crossed its limit.
    #[throws(ConnectorAgentError)]
    fn write(&mut self, val: Vec<u8>) {
        self.bytes_buf.extend_from_slice(val.as_slice());
        self.bytes_lengths.push(val.len());
        self.try_flush()?;
    }
}

impl<'a> PandasColumn<Option<Vec<u8>>> for BytesColumn<'a> {
    /// Stage an optional bytes value. A `None` is recorded as a zero-length
    /// entry, which `flush` skips, leaving the destination cell untouched.
    #[throws(ConnectorAgentError)]
    fn write(&mut self, val: Option<Vec<u8>>) {
        if let Some(bytes) = val {
            self.bytes_lengths.push(bytes.len());
            self.bytes_buf.extend_from_slice(&bytes);
            self.try_flush()?;
        } else {
            self.bytes_lengths.push(0);
        }
    }
}

// Map the owned bytes value types produced by sources to the pandas column
// implementation that stores them.
impl HasPandasColumn for Vec<u8> {
    type PandasColumn<'a> = BytesColumn<'a>;
}

impl HasPandasColumn for Option<Vec<u8>> {
    type PandasColumn<'a> = BytesColumn<'a>;
}

impl<'a> BytesColumn<'a> {
    /// Split this column into consecutive partitions of the sizes given in
    /// `counts`, so multiple workers can fill disjoint row ranges.
    pub fn partition(self, counts: &[usize]) -> Vec<BytesColumn<'a>> {
        let mut partitions = vec![];
        let mut data = self.data;

        for &c in counts {
            let (splitted_data, rest) = data.split_at_mut(c);
            data = rest;

            partitions.push(BytesColumn {
                data: splitted_data,
                next_write: 0,
                bytes_lengths: vec![],
                bytes_buf: Vec::with_capacity(self.buf_size),
                buf_size: self.buf_size,
                mutex: self.mutex.clone(),
            });
        }

        partitions
    }

    /// Materialize every buffered value as a Python `bytes` object in the
    /// destination slice, then reset the staging buffers for the next batch.
    #[throws(ConnectorAgentError)]
    pub fn flush(&mut self) {
        let nstrings = self.bytes_lengths.len();

        if nstrings > 0 {
            // NOTE(review): assumes the calling thread actually holds the GIL;
            // `assume_gil_acquired` performs no runtime check.
            let py = unsafe { Python::assume_gil_acquired() };

            {
                // allocation in python is not thread safe
                let _guard = self
                    .mutex
                    .lock()
                    .map_err(|e| anyhow!("mutex poisoned {}", e))?;
                let mut start = 0;
                for (i, &len) in self.bytes_lengths.iter().enumerate() {
                    let end = start + len;
                    // Zero-length entries mark nulls and leave the cell
                    // untouched. NOTE(review): an empty non-null value is
                    // indistinguishable from a null here — TODO confirm that
                    // empty bytea is not expected.
                    if len != 0 {
                        unsafe {
                            // allocate and write in the same time
                            *self.data.get_unchecked_mut(self.next_write + i) = PyBytes(
                                pyo3::types::PyBytes::new(py, &self.bytes_buf[start..end]).into(),
                            );
                        };
                    }
                    start = end;
                }
            }

            self.bytes_buf.truncate(0);
            // Bug fix: the lengths must be cleared along with the byte buffer.
            // Previously a second flush re-read the stale lengths against an
            // empty `bytes_buf` (panicking on the `start..end` slice) and
            // double-advanced `next_write`.
            self.bytes_lengths.truncate(0);
            self.next_write += nstrings;
        }
    }

    /// Flush once the staging buffer has reached its configured threshold.
    #[throws(ConnectorAgentError)]
    pub fn try_flush(&mut self) {
        if self.bytes_buf.len() >= self.buf_size {
            self.flush()?;
        }
    }
}
2 changes: 2 additions & 0 deletions connectorx-python/src/pandas/pandas_columns/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
mod boolean;
mod bytes;
mod datetime;
mod float64;
mod int64;
mod string;
// TODO: use macro for integers

pub use crate::pandas::pandas_columns::bytes::{BytesBlock, BytesColumn};
pub use boolean::{BooleanBlock, BooleanColumn};
use connectorx::Result;
pub use datetime::{DateTimeBlock, DateTimeColumn};
Expand Down
Loading