Skip to content

Commit ef9f358

Browse files
nathanjmcdougall, machow, isabelizimm
authored
Move to adaptor backend (#298)
* Support adaptor in prepare_pin_version * Use adaptor in save_data * Use adaptor for default_title * underscore prefix for _adaptors.py; abstracting df_type in default_title * Removing duplication in _obj_name definition * Use adaptor in _create_meta * Pass pyright * Fix broken import * Refactoring type hints to avoid use of Self Various other type improvements * Remove singleton Union * Add databackend as a dependency * dev: add ruff to pyproject.toml * feat: allow save_data to accept an Adaptor * Remove unnecessary underscores * Remove misleading/unnecessary ClassVar declaration * Separate write_json from to_json (CQS) * Move calls to create_adaptor to hide them at a lower level * Add some tests * Use backported typing_extensions.TypeAlias for Python 3.9 * add typing_extensions --------- Co-authored-by: Michael Chow <[email protected]> Co-authored-by: isabel zimmerman <[email protected]>
1 parent 3b5ade3 commit ef9f358

File tree

8 files changed

+476
-193
lines changed

8 files changed

+476
-193
lines changed

pins/_adaptors.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from __future__ import annotations
2+
3+
import json
4+
from abc import abstractmethod
5+
from typing import TYPE_CHECKING, Any, ClassVar, overload
6+
7+
from databackend import AbstractBackend
8+
from typing_extensions import TypeAlias
9+
10+
if TYPE_CHECKING:
11+
import pandas as pd
12+
13+
PandasDataFrame: TypeAlias = pd.DataFrame
14+
DataFrame: TypeAlias = PandasDataFrame
15+
16+
17+
class AbstractPandasFrame(AbstractBackend):
    # Backend spec consumed by databackend's AbstractBackend so that
    # isinstance(obj, AbstractPandasFrame) matches pandas.DataFrame
    # (used by create_adaptor below) — presumably without requiring a
    # hard pandas import; verify against databackend's docs.
    _backends = [("pandas", "DataFrame")]
19+
20+
21+
AbstractDF: TypeAlias = AbstractPandasFrame
22+
23+
24+
class Adaptor:
    """Wrap a pinned object behind a uniform serialization interface.

    The base class handles arbitrary Python objects; subclasses (e.g.
    ``DFAdaptor``/``PandasAdaptor``) specialize writing behavior for
    particular data types. Unsupported formats raise NotImplementedError.
    """

    def __init__(self, data: Any) -> None:
        self._d = data

    def write_json(self, file: str) -> None:
        """Write the wrapped object to *file* as JSON."""
        with open(file, "w") as f:
            f.write(self.to_json())

    def to_json(self) -> str:
        """Return the wrapped object serialized as a JSON string."""
        # json is imported at module level; the previous local import
        # was redundant.
        return json.dumps(self._d)

    def write_joblib(self, file: str) -> None:
        """Write the wrapped object to *file* using joblib."""
        import joblib  # deferred: keep joblib optional at import time

        joblib.dump(self._d, file)

    def write_csv(self, file: str) -> None:
        """Raise: CSV output is only supported by dataframe adaptors."""
        msg = f"Writing to CSV is not supported for {type(self._d)}"
        raise NotImplementedError(msg)

    def write_parquet(self, file: str) -> None:
        """Raise: Parquet output is only supported by dataframe adaptors."""
        msg = f"Writing to Parquet is not supported for {type(self._d)}"
        raise NotImplementedError(msg)

    def write_feather(self, file: str) -> None:
        """Raise: Feather output is only supported by dataframe adaptors."""
        msg = f"Writing to Feather is not supported for {type(self._d)}"
        raise NotImplementedError(msg)

    @property
    def data_preview(self) -> str:
        """JSON payload used to render a data preview in index.html."""
        # note that the R library uses jsonlite::toJSON
        # TODO(compat): set display none in index.html
        return json.dumps({})

    def default_title(self, name: str) -> str:
        """Return the default pin title for an object pinned as *name*."""
        # TODO(compat): title says CSV rather than data.frame
        # see https://github.com/machow/pins-python/issues/5
        return f"{name}: a pinned {self._obj_name}"

    @property
    def _obj_name(self) -> str:
        # Human-readable description of the wrapped object's type,
        # e.g. "dict object".
        return f"{type(self._d).__qualname__} object"
70+
71+
72+
class DFAdaptor(Adaptor):
    """Abstract adaptor for dataframe-like objects.

    Concrete subclasses (e.g. ``PandasAdaptor``) implement ``columns``,
    ``shape``, and ``head`` for their dataframe library; this class
    builds the shared preview/title behavior on top of them.
    """

    # _d is an instance attribute (assigned in Adaptor.__init__), so a
    # plain annotation is correct here — ClassVar was misleading and
    # makes type checkers reject the instance assignment.
    _d: DataFrame

    def __init__(self, data: DataFrame) -> None:
        super().__init__(data)

    @property
    def df_type(self) -> str:
        # Consider overriding this for specialized dataframes
        return "DataFrame"

    @property
    @abstractmethod
    def columns(self) -> list[Any]: ...

    @property
    @abstractmethod
    def shape(self) -> tuple[int, int]: ...

    @abstractmethod
    def head(self, n: int) -> DFAdaptor: ...

    @property
    def data_preview(self) -> str:
        """JSON preview of the first rows, mirroring R pins' format."""
        # TODO(compat) is 100 hard-coded?
        # Note that we go df -> json -> dict, to take advantage of type conversions in the dataframe library
        data: list[dict[Any, Any]] = json.loads(self.head(100).to_json())
        columns = [
            {"name": [col], "label": [col], "align": ["left"], "type": [""]}
            for col in self.columns
        ]

        # this reproduces R pins behavior, by omitting entries that would be null
        data_no_nulls = [{k: v for k, v in row.items() if v is not None} for row in data]

        return json.dumps({"data": data_no_nulls, "columns": columns})

    @property
    def _obj_name(self) -> str:
        # e.g. "10 x 3 DataFrame"
        row, col = self.shape
        return f"{row} x {col} {self.df_type}"
113+
114+
115+
class PandasAdaptor(DFAdaptor):
    """Adaptor implementing the DFAdaptor interface for pandas DataFrames."""

    # Instance attribute set in Adaptor.__init__ — not a class variable,
    # so the previous ClassVar annotation was incorrect.
    _d: PandasDataFrame

    def __init__(self, data: AbstractPandasFrame) -> None:
        super().__init__(data)

    @property
    def columns(self) -> list[Any]:
        """Column labels as a plain Python list."""
        return self._d.columns.tolist()

    @property
    def shape(self) -> tuple[int, int]:
        """(n_rows, n_cols) of the wrapped frame."""
        return self._d.shape

    def head(self, n: int) -> PandasAdaptor:
        """Return a new adaptor wrapping the first *n* rows."""
        return PandasAdaptor(self._d.head(n))

    def to_json(self) -> str:
        # orient="records" yields a JSON list of row dicts, the shape
        # DFAdaptor.data_preview expects.
        return self._d.to_json(orient="records")

    def write_csv(self, file: str) -> None:
        """Write the frame to *file* as CSV, without the index."""
        self._d.to_csv(file, index=False)

    def write_parquet(self, file: str) -> None:
        """Write the frame to *file* as Parquet."""
        self._d.to_parquet(file)

    def write_feather(self, file: str) -> None:
        """Write the frame to *file* as Feather."""
        self._d.to_feather(file)
143+
144+
145+
@overload
def create_adaptor(obj: DataFrame) -> DFAdaptor: ...
@overload
def create_adaptor(obj: Any) -> Adaptor: ...
def create_adaptor(obj: Any | DataFrame) -> Adaptor | DFAdaptor:
    """Wrap *obj* in the most specific Adaptor available for its type.

    Pandas DataFrames get a PandasAdaptor; an object that is already an
    Adaptor is returned unchanged; anything else falls back to the
    generic Adaptor.
    """
    if isinstance(obj, AbstractPandasFrame):
        return PandasAdaptor(obj)
    if isinstance(obj, Adaptor):
        return obj
    return Adaptor(obj)

pins/boards.py

Lines changed: 8 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,12 @@
1010
from datetime import datetime, timedelta
1111
from io import IOBase
1212
from pathlib import Path
13-
from typing import Protocol
13+
from typing import Any, Protocol
1414

1515
from importlib_resources import files
1616
from importlib_resources.abc import Traversable
1717

18+
from ._adaptors import Adaptor, create_adaptor
1819
from .cache import PinsCache
1920
from .config import get_allow_rsc_short_name
2021
from .drivers import REQUIRES_SINGLE_FILE, default_title, load_data, load_file, save_data
@@ -25,6 +26,8 @@
2526

2627
_log = logging.getLogger(__name__)
2728

29+
_ = default_title # Keep this import for backward compatibility
30+
2831

2932
class IFileSystem(Protocol):
3033
protocol: str | list
@@ -715,7 +718,7 @@ def prepare_pin_version(
715718
def _create_meta(
716719
self,
717720
pin_dir_path,
718-
x,
721+
x: Adaptor | Any,
719722
name: str | None = None,
720723
type: str | None = None,
721724
title: str | None = None,
@@ -732,7 +735,7 @@ def _create_meta(
732735
raise NotImplementedError("Type argument is required.")
733736

734737
if title is None:
735-
title = default_title(x, name)
738+
title = create_adaptor(x).default_title(name)
736739

737740
# create metadata from object on disk ---------------------------------
738741
# save all pin data to a temporary folder (including data.txt), so we
@@ -1223,46 +1226,17 @@ def prepare_pin_version(self, pin_dir_path, x, name: str | None, *args, **kwargs
12231226
# render index.html ------------------------------------------------
12241227

12251228
all_files = [meta.file] if isinstance(meta.file, str) else meta.file
1226-
pin_files = ", ".join(f"""<a href="{x}">{x}</a>""" for x in all_files)
1229+
pin_files = ", ".join(f"""<a href="{file}">{file}</a>""" for file in all_files)
12271230

12281231
context = {
12291232
"date": meta.version.created.replace(microsecond=0),
12301233
"pin_name": self.path_to_pin(name),
12311234
"pin_files": pin_files,
12321235
"pin_metadata": meta,
12331236
"board_deparse": board_deparse(self),
1237+
"data_preview": create_adaptor(x).data_preview,
12341238
}
12351239

1236-
# data preview ----
1237-
1238-
# TODO: move out data_preview logic? Can we draw some limits here?
1239-
# note that the R library uses jsonlite::toJSON
1240-
1241-
import json
1242-
1243-
import pandas as pd
1244-
1245-
if isinstance(x, pd.DataFrame):
1246-
# TODO(compat) is 100 hard-coded?
1247-
# Note that we go df -> json -> dict, to take advantage of pandas type conversions
1248-
data = json.loads(x.head(100).to_json(orient="records"))
1249-
columns = [
1250-
{"name": [col], "label": [col], "align": ["left"], "type": [""]}
1251-
for col in x
1252-
]
1253-
1254-
# this reproduces R pins behavior, by omitting entries that would be null
1255-
data_no_nulls = [
1256-
{k: v for k, v in row.items() if v is not None} for row in data
1257-
]
1258-
1259-
context["data_preview"] = json.dumps(
1260-
{"data": data_no_nulls, "columns": columns}
1261-
)
1262-
else:
1263-
# TODO(compat): set display none in index.html
1264-
context["data_preview"] = json.dumps({})
1265-
12661240
# do not show r code if not round-trip friendly
12671241
if meta.type in ["joblib"]:
12681242
context["show_r_style"] = "display:none"

pins/drivers.py

Lines changed: 19 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from collections.abc import Sequence
22
from pathlib import Path
3+
from typing import Any
4+
5+
from pins._adaptors import Adaptor, create_adaptor
36

47
from .config import PINS_ENV_INSECURE_READ, get_allow_pickle_read
58
from .errors import PinsInsecureReadError
@@ -13,15 +16,6 @@
1316
REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib"])
1417

1518

16-
def _assert_is_pandas_df(x, file_type: str) -> None:
17-
import pandas as pd
18-
19-
if not isinstance(x, pd.DataFrame):
20-
raise NotImplementedError(
21-
f"Currently only pandas.DataFrame can be saved as type {file_type!r}."
22-
)
23-
24-
2519
def load_path(filename: str, path_to_version, pin_type=None):
2620
# file path creation ------------------------------------------------------
2721
if pin_type == "table":
@@ -126,7 +120,7 @@ def load_data(
126120

127121

128122
def save_data(
129-
obj, fname, pin_type=None, apply_suffix: bool = True
123+
obj: "Adaptor | Any", fname, pin_type=None, apply_suffix: bool = True
130124
) -> "str | Sequence[str]":
131125
# TODO: extensible saving with deferred importing
132126
# TODO: how to encode arguments to saving / loading drivers?
@@ -135,6 +129,11 @@ def save_data(
135129
# as argument to board, and then type dispatchers for explicit cases
136130
# of saving / loading objects different ways.
137131

132+
if isinstance(obj, Adaptor):
133+
adaptor, obj = obj, obj._d
134+
else:
135+
adaptor = create_adaptor(obj)
136+
138137
if apply_suffix:
139138
if pin_type == "file":
140139
suffix = "".join(Path(obj).suffixes)
@@ -149,39 +148,22 @@ def save_data(
149148
final_name = f"{fname}{suffix}"
150149

151150
if pin_type == "csv":
152-
_assert_is_pandas_df(obj, file_type=type)
153-
154-
obj.to_csv(final_name, index=False)
155-
151+
adaptor.write_csv(final_name)
156152
elif pin_type == "arrow":
157153
# NOTE: R pins accepts the type arrow, and saves it as feather.
158154
# we allow reading this type, but raise an error for writing.
159-
_assert_is_pandas_df(obj, file_type=type)
160-
161-
obj.to_feather(final_name)
162-
155+
adaptor.write_feather(final_name)
163156
elif pin_type == "feather":
164-
_assert_is_pandas_df(obj, file_type=type)
165-
166-
raise NotImplementedError(
157+
msg = (
167158
'Saving data as type "feather" no longer supported. Use type "arrow" instead.'
168159
)
169-
160+
raise NotImplementedError(msg)
170161
elif pin_type == "parquet":
171-
_assert_is_pandas_df(obj, file_type=type)
172-
173-
obj.to_parquet(final_name)
174-
162+
adaptor.write_parquet(final_name)
175163
elif pin_type == "joblib":
176-
import joblib
177-
178-
joblib.dump(obj, final_name)
179-
164+
adaptor.write_joblib(final_name)
180165
elif pin_type == "json":
181-
import json
182-
183-
json.dump(obj, open(final_name, "w"))
184-
166+
adaptor.write_json(final_name)
185167
elif pin_type == "file":
186168
import contextlib
187169
import shutil
@@ -202,14 +184,6 @@ def save_data(
202184
return final_name
203185

204186

205-
def default_title(obj, name):
206-
import pandas as pd
207-
208-
if isinstance(obj, pd.DataFrame):
209-
# TODO(compat): title says CSV rather than data.frame
210-
# see https://github.com/machow/pins-python/issues/5
211-
shape_str = " x ".join(map(str, obj.shape))
212-
return f"{name}: a pinned {shape_str} DataFrame"
213-
else:
214-
obj_name = type(obj).__qualname__
215-
return f"{name}: a pinned {obj_name} object"
187+
def default_title(obj: Any, name: str) -> str:
    """Return the default title for *obj* pinned under *name*.

    Thin shim delegating to the adaptor's ``default_title``.
    """
    # Kept for backward compatibility only.
    return create_adaptor(obj).default_title(name)

0 commit comments

Comments
 (0)