using the ``tskit upgrade`` command line utility from a tskit version earlier than 0.6.2.
+- Support for legacy formats from msprime<0.6 (HDF5 formats) has been dropped. This
+  includes support for ``tskit upgrade`` (:user:`hossam26644`, :issue:`2812`, :pr:`3138`)
It may be a legacy HDF file upgradable with " - "`tskit upgrade` or a compressed tree sequence file that can be decompressed " + "`tskit upgrade` from tskit version<0.6.2 or a compressed tree sequence file that can be decompressed " "with `tszip`."; const char *ibd_pairs_not_stored_msg = "Sample pairs are not stored by default " diff --git a/python/requirements/CI-complete/requirements.txt b/python/requirements/CI-complete/requirements.txt index 49f29e0c16..8920fa33ca 100644 --- a/python/requirements/CI-complete/requirements.txt +++ b/python/requirements/CI-complete/requirements.txt @@ -1,7 +1,6 @@ biopython==1.85 coverage==7.7.0 dendropy==5.0.1 -h5py==3.13.0 kastore==0.3.3 lshmm==0.0.8 msgpack==1.1.0 diff --git a/python/requirements/CI-tests-conda/requirements.txt b/python/requirements/CI-tests-conda/requirements.txt index ab771b8294..12c17302aa 100644 --- a/python/requirements/CI-tests-conda/requirements.txt +++ b/python/requirements/CI-tests-conda/requirements.txt @@ -1,4 +1,3 @@ msprime==1.3.3 tszip==0.2.5 -h5py==3.13.0 zarr<3 diff --git a/python/requirements/development.txt b/python/requirements/development.txt index dc4c572550..39653c87e7 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -6,7 +6,6 @@ codecov coverage dendropy flake8 -h5py>=2.6.0 jsonschema>=3.0.0 jupyter-book>=0.12.1 kastore @@ -22,7 +21,7 @@ numpy packaging portion pre-commit -pyparsing +pyparsing pysam pytest pytest-cov diff --git a/python/requirements/development.yml b/python/requirements/development.yml index 9f0a24ae4c..6aefcbd22a 100644 --- a/python/requirements/development.yml +++ b/python/requirements/development.yml @@ -13,7 +13,6 @@ dependencies: - dendropy - doxygen - flake8 - - h5py>=2.6.0 - jsonschema>=3.0.0 - jupyter-book>=0.12.1 - kastore diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 67b3890c2e..cca5045123 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -1,6 +1,6 @@ # MIT License # -# 
Copyright (c) 2018-2024 Tskit Developers +# Copyright (c) 2018-2025 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -30,7 +30,6 @@ import unittest from unittest import mock -import h5py import msprime import pytest @@ -311,16 +310,6 @@ def test_vcf_allow_position_zero(self, flags, expected): assert args.tree_sequence == tree_sequence assert args.allow_position_zero == expected - def test_upgrade_default_values(self): - parser = cli.get_tskit_parser() - cmd = "upgrade" - source = "in.trees" - destination = "out.trees" - args = parser.parse_args([cmd, source, destination]) - assert args.source == source - assert args.destination == destination - assert not args.remove_duplicate_positions - def test_info_default_values(self): parser = cli.get_tskit_parser() cmd = "info" @@ -655,68 +644,3 @@ def test_migrations(self): def test_provenances(self): self.verify("provenances") - - -class TestUpgrade(TestCli): - """ - Tests the results of the upgrade operation to ensure they are - correct. - """ - - def setUp(self): - fd, self.legacy_file_name = tempfile.mkstemp(prefix="msp_cli", suffix=".trees") - os.close(fd) - fd, self.current_file_name = tempfile.mkstemp(prefix="msp_cli", suffix=".trees") - os.close(fd) - - def tearDown(self): - os.unlink(self.legacy_file_name) - os.unlink(self.current_file_name) - - def test_conversion(self): - ts1 = msprime.simulate(10) - for version in [2, 3]: - tskit.dump_legacy(ts1, self.legacy_file_name, version=version) - stdout, stderr = capture_output( - cli.tskit_main, - ["upgrade", self.legacy_file_name, self.current_file_name], - ) - ts2 = tskit.load(self.current_file_name) - assert stdout == "" - assert stderr == "" - # Quick checks to ensure we have the right tree sequence. - # More thorough checks are done elsewhere. 
- assert ts1.get_sample_size() == ts2.get_sample_size() - assert ts1.num_edges == ts2.num_edges - assert ts1.get_num_trees() == ts2.get_num_trees() - - def test_duplicate_positions(self): - ts = msprime.simulate(10, mutation_rate=10) - for version in [2, 3]: - tskit.dump_legacy(ts, self.legacy_file_name, version=version) - root = h5py.File(self.legacy_file_name, "r+") - root["mutations/position"][:] = 0 - root.close() - stdout, stderr = capture_output( - cli.tskit_main, - ["upgrade", "-d", self.legacy_file_name, self.current_file_name], - ) - assert stdout == "" - tsp = tskit.load(self.current_file_name) - assert tsp.sample_size == ts.sample_size - assert tsp.num_sites == 1 - - def test_duplicate_positions_error(self): - ts = msprime.simulate(10, mutation_rate=10) - for version in [2, 3]: - tskit.dump_legacy(ts, self.legacy_file_name, version=version) - root = h5py.File(self.legacy_file_name, "r+") - root["mutations/position"][:] = 0 - root.close() - with mock.patch("sys.exit", side_effect=TestException) as mocked_exit: - with pytest.raises(TestException): - capture_output( - cli.tskit_main, - ["upgrade", self.legacy_file_name, self.current_file_name], - ) - assert mocked_exit.call_count == 1 diff --git a/python/tests/test_file_format.py b/python/tests/test_file_format.py index 2de38c487e..ff35dc90bc 100644 --- a/python/tests/test_file_format.py +++ b/python/tests/test_file_format.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2023 Tskit Developers +# Copyright (c) 2018-2025 Tskit Developers # Copyright (c) 2016-2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -23,15 +23,11 @@ """ Test cases for tskit's file format. 
""" -import json import os -import sys import tempfile import unittest import uuid as _uuid -from unittest import mock -import h5py import kastore import msprime import numpy as np @@ -274,249 +270,17 @@ def test_format_too_old_raised_for_hdf5(self): ): tskit.TableCollection.load(path) - def test_msprime_v_0_5_0(self): - path = os.path.join(test_data_dir, "hdf5-formats", "msprime-0.5.0_v10.0.hdf5") - ts = tskit.load_legacy(path) - self.verify_tree_sequence(ts) - - def test_msprime_v_0_4_0(self): - path = os.path.join(test_data_dir, "hdf5-formats", "msprime-0.4.0_v3.1.hdf5") - ts = tskit.load_legacy(path) - self.verify_tree_sequence(ts) - - def test_msprime_v_0_3_0(self): - path = os.path.join(test_data_dir, "hdf5-formats", "msprime-0.3.0_v2.0.hdf5") - ts = tskit.load_legacy(path) - self.verify_tree_sequence(ts) - def test_tskit_v_0_3_3(self): path = os.path.join(test_data_dir, "old-formats", "tskit-0.3.3.trees") ts = tskit.load(path) self.verify_tree_sequence(ts) -class TestRoundTrip(TestFileFormat): - """ - Tests if we can round trip convert a tree sequence in memory - through a V2 file format and a V3 format. - """ - - def verify_tree_sequences_equal(self, ts, tsp, simplify=True): - assert ts.sequence_length == tsp.sequence_length - t1 = ts.dump_tables() - # We need to sort and squash the edges in the new format because it - # has gone through an edgesets representation. Simplest way to do this - # is to call simplify. - if simplify: - t2 = tsp.simplify().tables - else: - t2 = tsp.tables - assert t1.nodes == t2.nodes - assert t1.edges == t2.edges - assert t1.sites == t2.sites - # The old formats can't represent mutation times so null them out. 
- t1.mutations.time = np.full_like(t1.mutations.time, tskit.UNKNOWN_TIME) - assert t1.mutations == t2.mutations - - def verify_round_trip(self, ts, version): - tskit.dump_legacy(ts, self.temp_file, version=version) - tsp = tskit.load_legacy(self.temp_file) - simplify = version < 10 - self.verify_tree_sequences_equal(ts, tsp, simplify=simplify) - tsp.dump(self.temp_file) - tsp = tskit.load(self.temp_file) - self.verify_tree_sequences_equal(ts, tsp, simplify=simplify) - for provenance in tsp.provenances(): - tskit.validate_provenance(json.loads(provenance.record)) - - def verify_round_trip_no_legacy(self, ts): - ts.dump(self.temp_file) - tsp = tskit.load(self.temp_file) - self.verify_tree_sequences_equal(ts, tsp, simplify=False) - for provenance in tsp.provenances(): - tskit.validate_provenance(json.loads(provenance.record)) - - def verify_malformed_json_v2(self, ts, group_name, attr, bad_json): - tskit.dump_legacy(ts, self.temp_file, 2) - # Write some bad JSON to the provenance string. - root = h5py.File(self.temp_file, "r+") - group = root[group_name] - group.attrs[attr] = bad_json - root.close() - tsp = tskit.load_legacy(self.temp_file) - self.verify_tree_sequences_equal(ts, tsp) - - def test_malformed_json_v2(self): - ts = multi_locus_with_mutation_example() - for group_name in ["trees", "mutations"]: - for attr in ["environment", "parameters"]: - for bad_json in ["", "{", "{},"]: - self.verify_malformed_json_v2(ts, group_name, attr, bad_json) - - def test_single_locus_no_mutation(self): - self.verify_round_trip(single_locus_no_mutation_example(), 2) - self.verify_round_trip(single_locus_no_mutation_example(), 3) - self.verify_round_trip(single_locus_no_mutation_example(), 10) - - def test_single_locus_with_mutation(self): - self.verify_round_trip(single_locus_with_mutation_example(), 2) - self.verify_round_trip(single_locus_with_mutation_example(), 3) - self.verify_round_trip(single_locus_with_mutation_example(), 10) - - def test_multi_locus_with_mutation(self): 
- self.verify_round_trip(multi_locus_with_mutation_example(), 2) - self.verify_round_trip(multi_locus_with_mutation_example(), 3) - self.verify_round_trip(multi_locus_with_mutation_example(), 10) - - def test_migration_example(self): - self.verify_round_trip(migration_example(), 2) - self.verify_round_trip(migration_example(), 3) - self.verify_round_trip(migration_example(), 10) - - def test_bottleneck_example(self): - self.verify_round_trip(migration_example(), 3) - self.verify_round_trip(migration_example(), 10) - - def test_no_provenance(self): - self.verify_round_trip(no_provenance_example(), 10) - - def test_provenance_timestamp_only(self): - self.verify_round_trip(provenance_timestamp_only_example(), 10) - - def test_recurrent_mutation_example(self): - ts = recurrent_mutation_example() - for version in [2, 3]: - with pytest.raises(ValueError): - tskit.dump_legacy(ts, self.temp_file, version) - self.verify_round_trip(ts, 10) - - def test_general_mutation_example(self): - ts = general_mutation_example() - for version in [2, 3]: - with pytest.raises(ValueError): - tskit.dump_legacy(ts, self.temp_file, version) - self.verify_round_trip(ts, 10) - - def test_node_metadata_example(self): - self.verify_round_trip(node_metadata_example(), 10) - - def test_site_metadata_example(self): - self.verify_round_trip(site_metadata_example(), 10) - - def test_mutation_metadata_example(self): - self.verify_round_trip(mutation_metadata_example(), 10) - - def test_migration_metadata_example(self): - self.verify_round_trip(migration_metadata_example(), 10) - - def test_edge_metadata_example(self): - # metadata for edges was introduced - self.verify_round_trip_no_legacy(edge_metadata_example()) - - def test_multichar_mutation_example(self): - self.verify_round_trip(multichar_mutation_example(), 10) - - def test_empty_file(self): - tables = tskit.TableCollection(sequence_length=3) - self.verify_round_trip(tables.tree_sequence(), 10) - - def test_zero_edges(self): - tables = 
tskit.TableCollection(sequence_length=3) - tables.nodes.add_row(time=0) - self.verify_round_trip(tables.tree_sequence(), 10) - - def test_v2_no_samples(self): - ts = multi_locus_with_mutation_example() - tskit.dump_legacy(ts, self.temp_file, version=2) - root = h5py.File(self.temp_file, "r+") - del root["samples"] - root.close() - tsp = tskit.load_legacy(self.temp_file) - self.verify_tree_sequences_equal(ts, tsp) - - def test_duplicate_mutation_positions_single_value(self): - ts = multi_locus_with_mutation_example() - for version in [2, 3]: - tskit.dump_legacy(ts, self.temp_file, version=version) - root = h5py.File(self.temp_file, "r+") - root["mutations/position"][:] = 0 - root.close() - with pytest.raises(tskit.DuplicatePositionsError): - tskit.load_legacy(self.temp_file) - tsp = tskit.load_legacy(self.temp_file, remove_duplicate_positions=True) - assert tsp.num_sites == 1 - sites = list(tsp.sites()) - assert sites[0].position == 0 - - def test_duplicate_mutation_positions(self): - ts = multi_locus_with_mutation_example() - for version in [2, 3]: - tskit.dump_legacy(ts, self.temp_file, version=version) - root = h5py.File(self.temp_file, "r+") - position = np.array(root["mutations/position"]) - position[0] = position[1] - root["mutations/position"][:] = position - root.close() - with pytest.raises(tskit.DuplicatePositionsError): - tskit.load_legacy(self.temp_file) - tsp = tskit.load_legacy(self.temp_file, remove_duplicate_positions=True) - assert tsp.num_sites == position.shape[0] - 1 - position_after = list(s.position for s in tsp.sites()) - assert list(position[1:]) == position_after - - class TestErrors(TestFileFormat): """ Test various API errors. 
""" - def test_v2_non_binary_records(self): - demographic_events = [ - msprime.SimpleBottleneck(time=0.01, population=0, proportion=1) - ] - ts = msprime.simulate( - sample_size=10, demographic_events=demographic_events, random_seed=1 - ) - with pytest.raises(ValueError): - tskit.dump_legacy(ts, self.temp_file, 2) - - def test_unsupported_version(self): - ts = msprime.simulate(10) - with pytest.raises(ValueError): - tskit.dump_legacy(ts, self.temp_file, version=4) - # Cannot read current files. - ts.dump(self.temp_file) - # Catch Exception here because h5py throws different exceptions on py2 and py3 - with pytest.raises(Exception): # noqa B017 - tskit.load_legacy(self.temp_file) - - def test_no_version_number(self): - root = h5py.File(self.temp_file, "w") - root.attrs["x"] = 0 - root.close() - with pytest.raises(ValueError): - tskit.load_legacy(self.temp_file) - - def test_unknown_legacy_version(self): - root = h5py.File(self.temp_file, "w") - root.attrs["format_version"] = (1024, 0) # Arbitrary unknown version - root.close() - with pytest.raises(ValueError): - tskit.load_legacy(self.temp_file) - - def test_no_h5py(self): - ts = msprime.simulate(10) - path = os.path.join(test_data_dir, "hdf5-formats", "msprime-0.3.0_v2.0.hdf5") - msg = ( - "Legacy formats require h5py. 
Install via `pip install h5py` or" - " `conda install h5py`" - ) - with mock.patch.dict(sys.modules, {"h5py": None}): - with pytest.raises(ImportError, match=msg): - tskit.load_legacy(path) - with pytest.raises(ImportError, match=msg): - tskit.dump_legacy(ts, path) - def test_tszip_file(self): ts = msprime.simulate(5) tszip.compress(ts, self.temp_file) @@ -1103,12 +867,6 @@ def test_load_empty_kastore(self): with pytest.raises(exceptions.LibraryError): tskit.load(self.temp_file) - def test_load_non_tskit_hdf5(self): - with h5py.File(self.temp_file, "w") as root: - root["x"] = np.zeros(10) - with pytest.raises(exceptions.FileFormatError): - tskit.load(self.temp_file) - def test_old_version_load_error(self): ts = msprime.simulate(10, random_seed=1) for bad_version in [(0, 1), (0, 8), (2, 0), (CURRENT_FILE_MAJOR - 1, 0)]: diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index ce999e3fc3..edecb44bde 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -44,14 +44,13 @@ import tskit.metadata as metadata -class TestMetadataHdf5RoundTrip(unittest.TestCase): +class TestMetadataRoundTrip(unittest.TestCase): """ - Tests that we can encode metadata under various formats and this will - successfully round-trip through the HDF5 format. + Tests that we can encode metadata under various formats. 
""" def setUp(self): - fd, self.temp_file = tempfile.mkstemp(prefix="msp_hdf5meta_test_") + fd, self.temp_file = tempfile.mkstemp(prefix="msp_meta_test_") os.close(fd) def tearDown(self): diff --git a/python/tskit/__init__.py b/python/tskit/__init__.py index df920f9b45..5777064b31 100644 --- a/python/tskit/__init__.py +++ b/python/tskit/__init__.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2024 Tskit Developers +# Copyright (c) 2018-2025 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -73,7 +73,6 @@ from tskit.provenance import __version__ # NOQA from tskit.provenance import validate_provenance # NOQA -from tskit.formats import * # NOQA from tskit.trees import * # NOQA from tskit.genotypes import Variant # NOQA from tskit.tables import * # NOQA diff --git a/python/tskit/cli.py b/python/tskit/cli.py index b20bd260d9..7b00e6888d 100644 --- a/python/tskit/cli.py +++ b/python/tskit/cli.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2018-2024 Tskit Developers +# Copyright (c) 2018-2025 Tskit Developers # Copyright (c) 2015-2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -68,19 +68,6 @@ def run_trees(args): print(tree.draw(format="unicode")) -def run_upgrade(args): - try: - tree_sequence = tskit.load_legacy(args.source, args.remove_duplicate_positions) - tree_sequence.dump(args.destination) - except tskit.DuplicatePositionsError: - sys_exit( - "Error: Duplicate mutation positions in the source file detected.\n\n" - 'This is not supported in the current file format. Running "upgrade -d" ' - "will remove these duplicate positions. However, this will result in loss " - "of data from the original file!" 
- ) - - def run_individuals(args): tree_sequence = load_tree_sequence(args.tree_sequence) tree_sequence.dump_text(individuals=sys.stdout, precision=args.precision) @@ -183,21 +170,6 @@ def get_tskit_parser(): ) parser.set_defaults(runner=run_trees) - parser = subparsers.add_parser( - "upgrade", help="Upgrade legacy tree sequence files." - ) - parser.add_argument( - "source", help="The source tskit tree sequence file in legacy format" - ) - parser.add_argument("destination", help="The filename of the upgraded copy.") - parser.add_argument( - "--remove-duplicate-positions", - "-d", - action="store_true", - default=False, - help="Remove any duplicated mutation positions in the source file. ", - ) - parser.set_defaults(runner=run_upgrade) # suppress fasta visibility until we have a reference sequence # See https://github.com/tskit-dev/tskit/issues/1888 # parser = subparsers.add_parser( diff --git a/python/tskit/formats.py b/python/tskit/formats.py deleted file mode 100644 index 76d0ad7376..0000000000 --- a/python/tskit/formats.py +++ /dev/null @@ -1,581 +0,0 @@ -# MIT License -# -# Copyright (c) 2018-2024 Tskit Developers -# Copyright (c) 2016-2017 University of Oxford -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -""" -Module responsible for converting tree sequence files from older -formats. -""" -import datetime -import json -import logging - -import numpy as np - -import tskit -import tskit.exceptions as exceptions -import tskit.provenance as provenance - - -def _get_v2_provenance(command, attrs): - """ - Returns the V2 tree provenance attributes reformatted as a provenance record. - """ - environment = {} - parameters = {} - # Try to get the provenance strings. Malformed JSON should not prevent us - # from finishing the conversion. - try: - environment = json.loads(str(attrs["environment"])) - except ValueError: - logging.warning("Failed to convert environment provenance") - try: - parameters = json.loads(str(attrs["parameters"])) - except ValueError: - logging.warning("Failed to convert parameters provenance") - parameters["command"] = command - provenance_dict = provenance.get_provenance_dict(parameters) - provenance_dict["version"] = environment.get("msprime_version", "Unknown_version") - provenance_dict["environment"] = environment - return json.dumps(provenance_dict).encode() - - -def _get_upgrade_provenance(root): - """ - Returns the provenance string from upgrading the specified HDF5 file. - """ - # TODO add more parameters here like filename, etc. - parameters = { - "command": "upgrade", - "source_version": list(map(int, root.attrs["format_version"])), - } - s = json.dumps(provenance.get_provenance_dict(parameters)) - return s.encode() - - -def _convert_hdf5_mutations( - mutations_group, sites, mutations, remove_duplicate_positions -): - """ - Loads the v2/v3 into the specified tables. 
- """ - position = np.array(mutations_group["position"]) - node = np.array(mutations_group["node"], dtype=np.int32) - unique_position, index = np.unique(position, return_index=True) - if unique_position.shape != position.shape: - if remove_duplicate_positions: - position = position[index] - node = node[index] - else: - # TODO add the number of duplicates so that we can improve the - # error message. - raise exceptions.DuplicatePositionsError() - num_mutations = position.shape[0] - sites.set_columns( - position=position, - ancestral_state=ord("0") * np.ones(num_mutations, dtype=np.int8), - ancestral_state_offset=np.arange(num_mutations + 1, dtype=np.uint32), - ) - mutations.set_columns( - node=node, - site=np.arange(num_mutations, dtype=np.int32), - derived_state=ord("1") * np.ones(num_mutations, dtype=np.int8), - derived_state_offset=np.arange(num_mutations + 1, dtype=np.uint32), - ) - - -def _set_populations(tables): - """ - Updates PopulationTable suitable to represent the populations referred to - in the node table. 
- """ - if len(tables.nodes) > 0: - for _ in range(np.max(tables.nodes.population) + 1): - tables.populations.add_row() - - -def _load_legacy_hdf5_v2(root, remove_duplicate_positions): - # Get the coalescence records - trees_group = root["trees"] - old_timestamp = datetime.datetime.min.isoformat() - provenances = tskit.ProvenanceTable() - provenances.add_row( - timestamp=old_timestamp, - record=_get_v2_provenance("generate_trees", trees_group.attrs), - ) - num_rows = trees_group["node"].shape[0] - index = np.arange(num_rows, dtype=int) - parent = np.zeros(2 * num_rows, dtype=np.int32) - parent[2 * index] = trees_group["node"] - parent[2 * index + 1] = trees_group["node"] - left = np.zeros(2 * num_rows, dtype=np.float64) - left[2 * index] = trees_group["left"] - left[2 * index + 1] = trees_group["left"] - right = np.zeros(2 * num_rows, dtype=np.float64) - right[2 * index] = trees_group["right"] - right[2 * index + 1] = trees_group["right"] - child = np.array(trees_group["children"], dtype=np.int32).flatten() - - tables = tskit.TableCollection(np.max(right)) - tables.edges.set_columns(left=left, right=right, parent=parent, child=child) - - cr_node = np.array(trees_group["node"], dtype=np.int32) - num_nodes = max(np.max(child), np.max(cr_node)) + 1 - sample_size = np.min(cr_node) - flags = np.zeros(num_nodes, dtype=np.uint32) - population = np.zeros(num_nodes, dtype=np.int32) - time = np.zeros(num_nodes, dtype=np.float64) - flags[:sample_size] = tskit.NODE_IS_SAMPLE - cr_population = np.array(trees_group["population"], dtype=np.int32) - cr_time = np.array(trees_group["time"]) - time[cr_node] = cr_time - population[cr_node] = cr_population - if "samples" in root: - samples_group = root["samples"] - population[:sample_size] = np.array(samples_group["population"], copy=True) - if "time" in samples_group: - time[:sample_size] = np.array(samples_group["time"], copy=True) - tables.nodes.set_columns(flags=flags, population=population, time=time) - _set_populations(tables) - 
- if "mutations" in root: - mutations_group = root["mutations"] - _convert_hdf5_mutations( - mutations_group, tables.sites, tables.mutations, remove_duplicate_positions - ) - provenances.add_row( - timestamp=old_timestamp, - record=_get_v2_provenance("generate_mutations", mutations_group.attrs), - ) - tables.provenances.add_row(_get_upgrade_provenance(root)) - tables.sort() - return tables.tree_sequence() - - -def _load_legacy_hdf5_v3(root, remove_duplicate_positions): - # get the trees group for the records and samples - trees_group = root["trees"] - nodes_group = trees_group["nodes"] - time = np.array(nodes_group["time"]) - - breakpoints = np.array(trees_group["breakpoints"]) - records_group = trees_group["records"] - left_indexes = np.array(records_group["left"]) - right_indexes = np.array(records_group["right"]) - record_node = np.array(records_group["node"], dtype=np.int32) - num_nodes = time.shape[0] - sample_size = np.min(record_node) - flags = np.zeros(num_nodes, dtype=np.uint32) - flags[:sample_size] = tskit.NODE_IS_SAMPLE - - children_length = np.array(records_group["num_children"], dtype=np.uint32) - total_rows = np.sum(children_length) - left = np.zeros(total_rows, dtype=np.float64) - right = np.zeros(total_rows, dtype=np.float64) - parent = np.zeros(total_rows, dtype=np.int32) - record_left = breakpoints[left_indexes] - record_right = breakpoints[right_indexes] - k = 0 - for j in range(left_indexes.shape[0]): - for _ in range(children_length[j]): - left[k] = record_left[j] - right[k] = record_right[j] - parent[k] = record_node[j] - k += 1 - tables = tskit.TableCollection(np.max(right)) - tables.nodes.set_columns( - flags=flags, time=nodes_group["time"], population=nodes_group["population"] - ) - _set_populations(tables) - tables.edges.set_columns( - left=left, right=right, parent=parent, child=records_group["children"] - ) - if "mutations" in root: - _convert_hdf5_mutations( - root["mutations"], - tables.sites, - tables.mutations, - 
remove_duplicate_positions, - ) - old_timestamp = datetime.datetime.min.isoformat() - if "provenance" in root: - for record in root["provenance"]: - tables.provenances.add_row(timestamp=old_timestamp, record=record) - tables.provenances.add_row(_get_upgrade_provenance(root)) - tables.sort() - return tables.tree_sequence() - - -def get_h5py(): - try: - import h5py - except ImportError: - raise ImportError( - "Legacy formats require h5py. Install via `pip install h5py`" - " or `conda install h5py`" - ) - return h5py - - -def load_legacy(filename, remove_duplicate_positions=False): - """ - Reads the specified msprime HDF5 file and returns a tree sequence. This - method is only intended to be used to read old format HDF5 files. - - If remove_duplicate_positions is True, remove all sites (except the - first) that contain duplicate positions. If this is False, any input - files that contain duplicate positions will raise an DuplicatePositionsError. - """ - loaders = { - 2: _load_legacy_hdf5_v2, - 3: _load_legacy_hdf5_v3, - 10: _load_legacy_hdf5_v10, - } - h5py = get_h5py() - root = h5py.File(filename, "r") - if "format_version" not in root.attrs: - raise ValueError("HDF5 file not in msprime format") - format_version = root.attrs["format_version"] - if format_version[0] not in loaders: - raise ValueError(f"Version {format_version} not supported for loading") - try: - ts = loaders[format_version[0]](root, remove_duplicate_positions) - finally: - root.close() - return ts - - -def _dump_legacy_hdf5_v2(tree_sequence, root): - root.attrs["format_version"] = (2, 999) - root.attrs["sample_size"] = tree_sequence.get_sample_size() - root.attrs["sequence_length"] = (tree_sequence.get_sequence_length(),) - left = [] - right = [] - node = [] - children = [] - time = [] - population = [] - for record in tree_sequence.records(): - left.append(record.left) - right.append(record.right) - node.append(record.node) - if len(record.children) != 2: - raise ValueError("V2 files only support 
binary records") - children.append(record.children) - time.append(record.time) - population.append(record.population) - length = len(time) - trees = root.create_group("trees") - trees.attrs["environment"] = json.dumps({"msprime_version": 0}) - trees.attrs["parameters"] = "{}" - trees.create_dataset("left", (length,), data=left, dtype=float) - trees.create_dataset("right", (length,), data=right, dtype=float) - trees.create_dataset("time", (length,), data=time, dtype=float) - trees.create_dataset("node", (length,), data=node, dtype="u4") - trees.create_dataset("population", (length,), data=population, dtype="u1") - trees.create_dataset("children", (length, 2), data=children, dtype="u4") - samples = root.create_group("samples") - population = [] - time = [] - length = tree_sequence.get_sample_size() - for u in range(length): - time.append(tree_sequence.get_time(u)) - population.append(tree_sequence.get_population(u)) - samples.create_dataset("time", (length,), data=time, dtype=float) - samples.create_dataset("population", (length,), data=population, dtype="u1") - if tree_sequence.get_num_mutations() > 0: - node = [] - position = [] - for site in tree_sequence.sites(): - if len(site.mutations) != 1: - raise ValueError("v2 does not support recurrent mutations") - if site.ancestral_state != "0" or site.mutations[0].derived_state != "1": - raise ValueError("v2 does not support non-binary mutations") - position.append(site.position) - node.append(site.mutations[0].node) - length = len(node) - mutations = root.create_group("mutations") - mutations.attrs["environment"] = json.dumps({"msprime_version": 0}) - mutations.attrs["parameters"] = "{}" - mutations.create_dataset("position", (length,), data=position, dtype=float) - mutations.create_dataset("node", (length,), data=node, dtype="u4") - - -def _dump_legacy_hdf5_v3(tree_sequence, root): - root.attrs["format_version"] = (3, 999) - root.attrs["sample_size"] = (0,) - root.attrs["sequence_length"] = (0,) - trees = 
root.create_group("trees") - # Get the breakpoints from the records. - left = [cr.left for cr in tree_sequence.records()] - breakpoints = np.unique(left + [tree_sequence.sequence_length]) - trees.create_dataset( - "breakpoints", (len(breakpoints),), data=breakpoints, dtype=float - ) - - left = [] - right = [] - node = [] - children = [] - num_children = [] - time = [] - for cr in tree_sequence.records(): - node.append(cr.node) - left.append(np.searchsorted(breakpoints, cr.left)) - right.append(np.searchsorted(breakpoints, cr.right)) - children.extend(cr.children) - num_children.append(len(cr.children)) - time.append(cr.time) - records_group = trees.create_group("records") - length = len(num_children) - records_group.create_dataset("left", (length,), data=left, dtype="u4") - records_group.create_dataset("right", (length,), data=right, dtype="u4") - records_group.create_dataset("node", (length,), data=node, dtype="u4") - records_group.create_dataset( - "num_children", (length,), data=num_children, dtype="u4" - ) - records_group.create_dataset( - "children", (len(children),), data=children, dtype="u4" - ) - - indexes_group = trees.create_group("indexes") - left_index = sorted(range(length), key=lambda j: (left[j], time[j])) - right_index = sorted(range(length), key=lambda j: (right[j], -time[j])) - indexes_group.create_dataset( - "insertion_order", (length,), data=left_index, dtype="u4" - ) - indexes_group.create_dataset( - "removal_order", (length,), data=right_index, dtype="u4" - ) - - nodes_group = trees.create_group("nodes") - population = np.zeros(tree_sequence.num_nodes, dtype="u4") - time = np.zeros(tree_sequence.num_nodes, dtype=float) - tree = next(tree_sequence.trees()) - for u in range(tree_sequence.sample_size): - population[u] = tree.population(u) - time[u] = tree.time(u) - for cr in tree_sequence.records(): - population[cr.node] = cr.population - time[cr.node] = cr.time - length = tree_sequence.num_nodes - nodes_group.create_dataset("time", (length,), 
data=time, dtype=float) - nodes_group.create_dataset("population", (length,), data=population, dtype="u4") - - node = [] - position = [] - for site in tree_sequence.sites(): - if len(site.mutations) != 1: - raise ValueError("v3 does not support recurrent mutations") - if site.ancestral_state != "0" or site.mutations[0].derived_state != "1": - raise ValueError("v3 does not support non-binary mutations") - position.append(site.position) - node.append(site.mutations[0].node) - length = len(position) - if length > 0: - mutations = root.create_group("mutations") - mutations.create_dataset("position", (length,), data=position, dtype=float) - mutations.create_dataset("node", (length,), data=node, dtype="u4") - - -def _add_dataset(group, name, data): - # In the HDF5 format any zero-d arrays must be excluded. - if data.shape[0] > 0: - group.create_dataset(name, data=data) - - -def _dump_legacy_hdf5_v10(tree_sequence, root): - root.attrs["format_version"] = (10, 999) - root.attrs["sample_size"] = (0,) - root.attrs["sequence_length"] = (tree_sequence.sequence_length,) - tables = tree_sequence.dump_tables() - - nodes = root.create_group("nodes") - _add_dataset(nodes, "time", tables.nodes.time) - _add_dataset(nodes, "flags", tables.nodes.flags) - _add_dataset(nodes, "population", tables.nodes.population) - _add_dataset(nodes, "metadata", tables.nodes.metadata) - _add_dataset(nodes, "metadata_offset", tables.nodes.metadata_offset) - - edges = root.create_group("edges") - if len(tables.edges) > 0: - edges.create_dataset("left", data=tables.edges.left) - edges.create_dataset("right", data=tables.edges.right) - edges.create_dataset("parent", data=tables.edges.parent) - edges.create_dataset("child", data=tables.edges.child) - - left = tables.edges.left - right = tables.edges.right - time = tables.nodes.time[tables.edges.parent] - # We can do this more efficiently if we ever need to do it for anything - # other than testing. 
- indexes_group = edges.create_group("indexes") - length = len(tables.edges) - left_index = sorted(range(length), key=lambda j: (left[j], time[j])) - right_index = sorted(range(length), key=lambda j: (right[j], -time[j])) - indexes_group.create_dataset("insertion_order", data=left_index, dtype="u4") - indexes_group.create_dataset("removal_order", data=right_index, dtype="u4") - - migrations = root.create_group("migrations") - if len(tables.migrations) > 0: - migrations.create_dataset("left", data=tables.migrations.left) - migrations.create_dataset("right", data=tables.migrations.right) - migrations.create_dataset("node", data=tables.migrations.node) - migrations.create_dataset("source", data=tables.migrations.source) - migrations.create_dataset("dest", data=tables.migrations.dest) - migrations.create_dataset("time", data=tables.migrations.time) - - sites = root.create_group("sites") - _add_dataset(sites, "position", tables.sites.position) - _add_dataset(sites, "ancestral_state", tables.sites.ancestral_state) - _add_dataset(sites, "ancestral_state_offset", tables.sites.ancestral_state_offset) - _add_dataset(sites, "metadata", tables.sites.metadata) - _add_dataset(sites, "metadata_offset", tables.sites.metadata_offset) - - mutations = root.create_group("mutations") - _add_dataset(mutations, "site", tables.mutations.site) - _add_dataset(mutations, "node", tables.mutations.node) - _add_dataset(mutations, "parent", tables.mutations.parent) - _add_dataset(mutations, "derived_state", tables.mutations.derived_state) - _add_dataset( - mutations, "derived_state_offset", tables.mutations.derived_state_offset - ) - _add_dataset(mutations, "metadata", tables.mutations.metadata) - _add_dataset(mutations, "metadata_offset", tables.mutations.metadata_offset) - - provenances = root.create_group("provenances") - _add_dataset(provenances, "timestamp", tables.provenances.timestamp) - _add_dataset(provenances, "timestamp_offset", tables.provenances.timestamp_offset) - 
_add_dataset(provenances, "record", tables.provenances.record) - _add_dataset(provenances, "record_offset", tables.provenances.record_offset) - - -def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False): - # We cannot have duplicate positions in v10, so this parameter is ignored - sequence_length = root.attrs["sequence_length"] - try: - sequence_length = sequence_length[0] - except TypeError: - pass - tables = tskit.TableCollection(sequence_length) - - nodes_group = root["nodes"] - metadata = None - metadata_offset = None - if "metadata" in nodes_group: - metadata = nodes_group["metadata"] - metadata_offset = nodes_group["metadata_offset"] - if "flags" in nodes_group: - tables.nodes.set_columns( - flags=nodes_group["flags"], - population=nodes_group["population"], - time=nodes_group["time"], - metadata=metadata, - metadata_offset=metadata_offset, - ) - - edges_group = root["edges"] - if "left" in edges_group: - tables.edges.set_columns( - left=edges_group["left"], - right=edges_group["right"], - parent=edges_group["parent"], - child=edges_group["child"], - ) - - migrations_group = root["migrations"] - if "left" in migrations_group: - tables.migrations.set_columns( - left=migrations_group["left"], - right=migrations_group["right"], - node=migrations_group["node"], - source=migrations_group["source"], - dest=migrations_group["dest"], - time=migrations_group["time"], - ) - - sites_group = root["sites"] - if "position" in sites_group: - metadata = None - metadata_offset = None - if "metadata" in sites_group: - metadata = sites_group["metadata"] - metadata_offset = sites_group["metadata_offset"] - tables.sites.set_columns( - position=sites_group["position"], - ancestral_state=sites_group["ancestral_state"], - ancestral_state_offset=sites_group["ancestral_state_offset"], - metadata=metadata, - metadata_offset=metadata_offset, - ) - - mutations_group = root["mutations"] - if "site" in mutations_group: - metadata = None - metadata_offset = None - if "metadata" in 
mutations_group: - metadata = mutations_group["metadata"] - metadata_offset = mutations_group["metadata_offset"] - tables.mutations.set_columns( - site=mutations_group["site"], - node=mutations_group["node"], - parent=mutations_group["parent"], - derived_state=mutations_group["derived_state"], - derived_state_offset=mutations_group["derived_state_offset"], - metadata=metadata, - metadata_offset=metadata_offset, - ) - - provenances_group = root["provenances"] - if "timestamp" in provenances_group: - timestamp = provenances_group["timestamp"] - timestamp_offset = provenances_group["timestamp_offset"] - record = provenances_group["record"] - record_offset = provenances_group["record_offset"] - tables.provenances.set_columns( - timestamp=timestamp, - timestamp_offset=timestamp_offset, - record=record, - record_offset=record_offset, - ) - tables.provenances.add_row(_get_upgrade_provenance(root)) - _set_populations(tables) - return tables.tree_sequence() - - -def dump_legacy(tree_sequence, filename, version=3): - """ - Writes the specified tree sequence to a HDF5 file in the specified - legacy file format version. 
- """ - dumpers = { - 2: _dump_legacy_hdf5_v2, - 3: _dump_legacy_hdf5_v3, - 10: _dump_legacy_hdf5_v10, - } - if version not in dumpers: - raise ValueError(f"Version {version} file format is supported") - h5py = get_h5py() - root = h5py.File(filename, "w") - try: - dumpers[version](tree_sequence, root) - finally: - root.close() diff --git a/python/tskit/util.py b/python/tskit/util.py index 834bf6a20a..4fd077f03a 100644 --- a/python/tskit/util.py +++ b/python/tskit/util.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2024 Tskit Developers +# Copyright (c) 2018-2025 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -867,7 +867,7 @@ def raise_known_file_format_errors(open_file, existing_exception): "The specified file appears to be in HDF5 format. This file " "may have been generated by msprime < 0.6.0 (June 2018) which " "can no longer be read directly. Please convert to the new " - "kastore format using the ``tskit upgrade`` command." + "kastore format using the ``tskit upgrade`` command from tskit version<0.6.2" ) from existing_exception if header[:2] == b"\x50\x4b": raise tskit.FileFormatError( From b0f4f9af88f1d5041fbc31e8da7df3d1cdc747d5 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 29 Apr 2025 09:40:55 +0100 Subject: [PATCH 2/2] Remove copyright year check as it is flakey on CI --- .pre-commit-config.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d10d98bb9b..d31de96c1c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,10 +7,6 @@ repos: - id: mixed-line-ending - id: check-case-conflict - id: check-yaml - - repo: https://github.com/benjeffery/pre-commit-copyright-year - rev: c62dcbb78f724162e14197f8fa264eaa8c3aad49 - hooks: - - id: copyright-year - repo: https://github.com/benjeffery/pre-commit-clang-format rev: '1.0' hooks: