-
Notifications
You must be signed in to change notification settings - Fork 293
feat: delete orphaned files #1958
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9dcb580
e43505c
eed5ea8
8cca600
75b1240
6379480
0c2822e
aaf8fc2
b09641b
beec233
b888c56
ff461ed
3b3b10e
a62c8cf
07cbf1b
54e1e00
7c780d3
9b6c9ed
34d10b9
0335957
9f8145c
fbdcbd3
85b4ab3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,8 @@ | |
from __future__ import annotations | ||
|
||
from datetime import datetime, timezone | ||
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple | ||
from functools import reduce | ||
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple, Union | ||
|
||
from pyiceberg.conversions import from_bytes | ||
from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary | ||
|
@@ -665,10 +666,20 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table": | |
def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table": | ||
return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}) | ||
|
||
def all_manifests(self) -> "pa.Table": | ||
def all_manifests(self, snapshots: Optional[Union[list[Snapshot], list[int]]] = None) -> "pa.Table": | ||
import pyarrow as pa | ||
|
||
snapshots = self.tbl.snapshots() | ||
# coerce into snapshot objects if users passes in snapshot ids | ||
if snapshots is not None: | ||
if isinstance(snapshots[0], int): | ||
snapshots = [ | ||
snapshot | ||
for snapshot_id in snapshots | ||
if (snapshot := self.tbl.metadata.snapshot_by_id(snapshot_id)) is not None | ||
] | ||
else: | ||
snapshots = self.tbl.snapshots() | ||
|
||
if not snapshots: | ||
return pa.Table.from_pylist([], schema=self._get_all_manifests_schema()) | ||
|
||
|
@@ -678,6 +689,32 @@ def all_manifests(self) -> "pa.Table": | |
) | ||
return pa.concat_tables(manifests_by_snapshots) | ||
|
||
def _all_known_files(self) -> dict[str, set[str]]:
    """Collect every file path referenced by the table's current metadata.

    Returns:
        dict of {file_type: set of file paths} for each file type
        ("manifests", "manifest_lists", "statistics", "metadata", "datafiles").
    """
    snapshots = self.tbl.snapshots()
    metadata = self.tbl.metadata

    # Metadata files: the historical metadata log plus the current metadata
    # file itself. The current file must be tracked here — otherwise orphan
    # removal on a sufficiently old table could delete it and corrupt the table.
    metadata_paths = {entry.metadata_file for entry in metadata.metadata_log}
    metadata_paths.add(self.tbl.metadata_location)

    # NOTE(review): partition statistics files are not included here because
    # PyIceberg does not read tables with partition statistics yet; once it
    # does, those paths must be tracked as well.

    # Data files are gathered per snapshot in parallel, then unioned.
    executor = ExecutorFactory.get_or_create()
    data_file_sets: Iterator[Set[str]] = executor.map(
        lambda snapshot_id: set(self.files(snapshot_id)["file_path"].to_pylist()),
        [snapshot.snapshot_id for snapshot in snapshots],
    )

    return {
        "manifests": set(self.all_manifests(snapshots)["path"].to_pylist()),
        "manifest_lists": {snapshot.manifest_list for snapshot in snapshots},
        "statistics": {statistic.statistics_path for statistic in metadata.statistics},
        "metadata": metadata_paths,
        "datafiles": reduce(set.union, data_file_sets, set()),
    }
|
||
def _all_files(self, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table": | ||
import pyarrow as pa | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
from __future__ import annotations | ||
|
||
import logging | ||
from datetime import datetime, timedelta, timezone | ||
from functools import reduce | ||
from typing import TYPE_CHECKING, Set | ||
|
||
from pyiceberg.io import _parse_location | ||
from pyiceberg.utils.concurrent import ExecutorFactory | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
if TYPE_CHECKING: | ||
from pyiceberg.table import Table | ||
|
||
|
||
class MaintenanceTable:
    """Maintenance operations on an Iceberg table, e.g. removing orphaned files."""

    tbl: Table

    def __init__(self, tbl: Table) -> None:
        """Create a maintenance wrapper around *tbl*.

        Raises:
            ModuleNotFoundError: If PyArrow is not installed.
        """
        self.tbl = tbl

        try:
            import pyarrow as pa  # noqa
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("For metadata operations PyArrow needs to be installed") from e

    def _orphaned_files(self, location: str, older_than: timedelta | None = timedelta(days=3)) -> Set[str]:
        """Get all files which are not referenced in any metadata files of an Iceberg table and can thus be considered "orphaned".

        Args:
            location: The location to check for orphaned files.
            older_than: Only consider files last modified more than this long ago.
                Pass None to disable the age filter. Defaults to 3 days.

        Returns:
            A set of orphaned file paths.

        Raises:
            ModuleNotFoundError: If PyArrow is not installed.
        """
        try:
            import pyarrow as pa  # noqa: F401
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError("For deleting orphaned files PyArrow needs to be installed") from e

        from pyarrow.fs import FileSelector, FileType

        from pyiceberg.io.pyarrow import _fs_from_file_path

        # Every file referenced by the table metadata (manifests, manifest
        # lists, statistics, metadata files, data files) must never be deleted.
        all_known_files = self.tbl.inspect._all_known_files()
        flat_known_files: set[str] = reduce(set.union, all_known_files.values(), set())

        fs = _fs_from_file_path(self.tbl.io, location)

        _, _, path = _parse_location(location)
        selector = FileSelector(path, recursive=True)

        # Cut-off for the age filter; None disables filtering by age.
        as_of = datetime.now(timezone.utc) - older_than if older_than is not None else None

        # Keep only regular files (the listing may also return directories).
        # A file whose mtime the filesystem reports as None is conservatively
        # kept, since its age cannot be assessed.
        all_files = [
            f.path
            for f in fs.get_file_info(selector)
            if f.type == FileType.File and (as_of is None or (f.mtime is not None and f.mtime < as_of))
        ]

        orphaned_files = set(all_files).difference(flat_known_files)

        return orphaned_files

    def remove_orphaned_files(self, older_than: timedelta = timedelta(days=3), dry_run: bool = False) -> None:
        """Remove files which are not referenced in any metadata files of an Iceberg table and can thus be considered "orphaned".

        Args:
            older_than: Only delete files last modified more than this long ago. Defaults to 3 days.
            dry_run: If True, only log the files that would be deleted. Defaults to False.
        """
        location = self.tbl.location()
        orphaned_files = self._orphaned_files(location, older_than)
        logger.info(f"Found {len(orphaned_files)} orphaned files at {location}!")
        deleted_files = set()
        failed_to_delete_files = set()

        def _delete(file: str) -> None:
            # Best-effort: don't error if a single file can't be deleted
            # (e.g. it no longer exists). BaseExceptions such as
            # KeyboardInterrupt still propagate.
            try:
                self.tbl.io.delete(file)
                deleted_files.add(file)
            except Exception:
                failed_to_delete_files.add(file)

        if orphaned_files:
            if dry_run:
                logger.info(f"(Dry Run) Deleted {len(orphaned_files)} orphaned files at {location}!")
            else:
                executor = ExecutorFactory.get_or_create()
                deletes = executor.map(_delete, orphaned_files)
                # exhaust the lazy iterator so all deletes actually run
                list(deletes)
                logger.info(f"Deleted {len(deleted_files)} orphaned files at {location}!")
                logger.info(f"Files:\n{deleted_files}")
                if failed_to_delete_files:
                    logger.warning(f"Failed to delete {len(failed_to_delete_files)} orphaned files at {location}!")
                    logger.warning(f"Files:\n{failed_to_delete_files}")
        else:
            logger.info(f"No orphaned files found at {location}!")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
import os | ||
from datetime import datetime, timedelta | ||
from pathlib import Path, PosixPath | ||
from unittest.mock import patch | ||
|
||
import pyarrow as pa | ||
import pytest | ||
|
||
from pyiceberg.catalog import Catalog | ||
from pyiceberg.schema import Schema | ||
from pyiceberg.types import IntegerType, NestedField, StringType | ||
from tests.catalog.test_base import InMemoryCatalog | ||
|
||
|
||
@pytest.fixture
def catalog(tmp_path: PosixPath) -> InMemoryCatalog:
    """Provide an in-memory catalog with a 'default' namespace, backed by a temporary warehouse."""
    warehouse_location = tmp_path.absolute().as_posix()
    in_memory_catalog = InMemoryCatalog("test.in_memory.catalog", warehouse=warehouse_location)
    in_memory_catalog.create_namespace("default")
    return in_memory_catalog
|
||
|
||
def test_remove_orphaned_files(catalog: Catalog) -> None:
    """Orphans are deleted only once old enough; files known to the table are never touched."""
    table = catalog.create_table(
        "default.test_remove_orphaned_files",
        schema=Schema(
            NestedField(1, "city", StringType(), required=True),
            NestedField(2, "inhabitants", IntegerType(), required=True),
            # "city" is the identifier field, also known as the primary-key
            identifier_field_ids=[1],
        ),
    )

    rows = pa.Table.from_pylist(
        [
            {"city": "Drachten", "inhabitants": 45019},
            {"city": "Drachten", "inhabitants": 45019},
        ],
        schema=pa.schema(
            [
                pa.field("city", pa.string(), nullable=False),
                pa.field("inhabitants", pa.int32(), nullable=False),
            ]
        ),
    )
    table.append(rows)

    orphan = Path(table.location()) / "orphan.txt"
    orphan.touch()
    assert orphan.exists()

    # A dry run must not delete anything.
    table.maintenance.remove_orphaned_files(dry_run=True)
    assert orphan.exists()

    # The file was just created, so it is younger than the 3-day default cut-off.
    table.maintenance.remove_orphaned_files()
    assert orphan.exists()

    # Back-date the file beyond the cut-off; now it should be removed.
    five_days_ago = (datetime.now() - timedelta(days=5)).timestamp()
    os.utime(orphan, (five_days_ago, five_days_ago))
    table.maintenance.remove_orphaned_files()
    assert not orphan.exists()

    # Every file the table still references must remain on disk.
    for known_paths in table.inspect._all_known_files().values():
        for known_path in known_paths:
            assert Path(known_path).exists()
|
||
|
||
def test_remove_orphaned_files_with_invalid_file_doesnt_error(catalog: Catalog) -> None:
    """Failing to delete a single (e.g. nonexistent) file must not raise from remove_orphaned_files."""
    identifier = "default.test_remove_orphaned_files"

    schema = Schema(
        NestedField(1, "city", StringType(), required=True),
        NestedField(2, "inhabitants", IntegerType(), required=True),
        # Mark City as the identifier field, also known as the primary-key
        identifier_field_ids=[1],
    )

    tbl = catalog.create_table(identifier, schema=schema)

    arrow_schema = pa.schema(
        [
            pa.field("city", pa.string(), nullable=False),
            pa.field("inhabitants", pa.int32(), nullable=False),
        ]
    )

    df = pa.Table.from_pylist(
        [
            {"city": "Drachten", "inhabitants": 45019},
            {"city": "Drachten", "inhabitants": 45019},
        ],
        schema=arrow_schema,
    )
    tbl.append(df)

    file_that_does_not_exist = "foo/bar.baz"
    # Patch the private _orphaned_files helper (the public name "orphaned_files"
    # does not exist on MaintenanceTable, and patch.object without create=True
    # raises AttributeError for a missing attribute, erroring the test before
    # the code under test even runs).
    with patch.object(type(tbl.maintenance), "_orphaned_files", return_value={file_that_does_not_exist}):
        with patch.object(tbl.io, "delete", wraps=tbl.io.delete) as mock_delete:
            # Must not raise even though the only "orphan" cannot be deleted.
            tbl.maintenance.remove_orphaned_files(timedelta(days=3))
            mock_delete.assert_called_with(file_that_does_not_exist)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still need this now we have `all_files`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that `all_files` includes manifest lists or statistics files, right?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should reference deletes, but thinking of it, this is probably much more efficient because it does not pull in all the other data.