Skip to content

Commit eebbf00

Browse files
author
FelixAbrahamsson
committed
feature: function to verify that keys from old splits have not moved
1 parent 4052071 commit eebbf00

File tree

6 files changed

+47
-4
lines changed

6 files changed

+47
-4
lines changed

datastream/tools/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from datastream.tools.split_dataframes import split_dataframes
66
from datastream.tools.unstratified_split import unstratified_split
77
from datastream.tools.stratified_split import stratified_split
8+
from datastream.tools.verify_split import verify_split

datastream/tools/verify_split.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import json
2+
from pathlib import Path
3+
from pydantic import validate_arguments
4+
5+
6+
@validate_arguments
def verify_split(old_path: Path, new_path: Path):
    '''
    Verify that no keys from an old split are present in a different new split.

    Both files are parsed as JSON objects that map a split name to a
    collection of keys. Keys may stay in a split with the same name; a key
    from an old split that shows up in a new split with a *different* name
    is reported as an error.

    .. highlight:: python
    .. code-block:: python

        verify_split(
            "path/to/old/split.json",
            "path/to/new/split.json",
        )

    :param old_path: path to the JSON file describing the old split.
    :param new_path: path to the JSON file describing the new split.
    :raises ValueError: if any key from an old split is found in a new
        split with a different name. The message lists up to 10 of the
        offending keys, in the order they appear in the old split.
    '''
    old_splits = json.loads(old_path.read_text())
    # Parse the new file once and build each key set a single time instead
    # of redoing both for every old split.
    new_key_sets = {
        split_name: set(keys)
        for split_name, keys in json.loads(new_path.read_text()).items()
    }

    for old_split_name, old_split in old_splits.items():
        for new_split_name, new_keys in new_key_sets.items():
            if old_split_name == new_split_name:
                continue

            # Report the keys that actually moved, not merely the first
            # entries of the old split.
            moved_keys = [key for key in old_split if key in new_keys]
            if not moved_keys:
                continue

            preview = [str(key) for key in moved_keys[:10]]
            if len(moved_keys) > 10:
                preview.append("...")

            raise ValueError(
                f'Some keys from old split "{old_split_name}"'
                f' are present in new split "{new_split_name}":\n'
                + "\n".join(preview)
            )

docs/source/datastream.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11

22
Datastream
33
=====================
4+
45
.. autoclass:: datastream.Datastream
56
:members:
67
:member-order: bysource

docs/source/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ sampling, and finally converting to a ``torch.utils.data.DataLoader``.
2020
get_started
2121
dataset
2222
datastream
23+
tools
2324

2425
Indices and tables
2526
==================

docs/source/requirements.txt

+6-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ certifi==2020.4.5.2
99
cffi==1.14.0
1010
chardet==3.0.4
1111
commonmark==0.9.1
12-
cryptography>=3.2.0
12+
cryptography==3.4.7
1313
decorator==4.4.2
1414
docutils==0.16
1515
idna==2.9
@@ -30,10 +30,11 @@ pkginfo==1.5.0.1
3030
pluggy==0.13.1
3131
py==1.10.0
3232
pycparser==2.20
33-
pydantic==1.5.1
33+
pydantic==1.8.1
3434
Pygments==2.7.4
3535
pylint==2.5.3
3636
pyparsing==2.4.7
37+
pyspark==2.4.1
3738
pytest==5.4.3
3839
python-dateutil==2.8.1
3940
pytz==2020.1
@@ -46,7 +47,7 @@ SecretStorage==3.1.2
4647
six==1.15.0
4748
snowballstemmer==2.0.0
4849
soupsieve==2.0.1
49-
Sphinx==3.1.0
50+
Sphinx==3.5.4
5051
sphinx-jsonschema==1.15
5152
sphinx-pydantic==0.1.1
5253
sphinx-rtd-theme==0.4.3
@@ -57,9 +58,10 @@ sphinxcontrib-jsmath==1.0.1
5758
sphinxcontrib-qthelp==1.0.3
5859
sphinxcontrib-serializinghtml==1.1.4
5960
toml==0.10.1
60-
torch==1.4.0
61+
torch==1.8.1
6162
tqdm==4.46.1
6263
twine==3.1.1
64+
typing-extensions==3.10.0.0
6365
urllib3==1.25.9
6466
waitress==1.4.4
6567
wcwidth==0.2.4

docs/source/tools.rst

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
tools
3+
=====================
4+
5+
.. autofunction:: datastream.tools.verify_split

0 commit comments

Comments
 (0)