diff --git a/pulp_python/app/tasks/repair.py b/pulp_python/app/tasks/repair.py
index c1fa6a71..b2dcfc39 100644
--- a/pulp_python/app/tasks/repair.py
+++ b/pulp_python/app/tasks/repair.py
@@ -7,7 +7,10 @@
 from pulpcore.plugin.util import get_domain
 
 from pulp_python.app.models import PythonPackageContent, PythonRepository
-from pulp_python.app.utils import artifact_to_python_content_data
+from pulp_python.app.utils import (
+    artifact_to_python_content_data,
+    remote_artifact_to_python_content_data,
+)
 
 log = logging.getLogger(__name__)
 
@@ -49,22 +52,28 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
     """
-    # TODO: Add on_demand content repair
     immediate_content = content.filter(contentartifact__artifact__isnull=False)
+    on_demand_content = content.filter(contentartifact__remoteartifact__isnull=False)
+    # The join can return a package once per RemoteArtifact; processed_pks de-duplicates.
     domain = get_domain()
 
     batch = []
     set_of_update_fields = set()
     total_repaired = 0
+    processed_pks = set()
 
     progress_report = ProgressReport(
         message="Repairing packages' metadata",
         code="repair.metadata",
-        total=immediate_content.count(),
+        total=immediate_content.count() + on_demand_content.count(),
     )
     progress_report.save()
     with progress_report:
         for package in progress_report.iter(
             immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
         ):
+            if package.pk in processed_pks:
+                continue
+            processed_pks.add(package.pk)
+
             new_data = artifact_to_python_content_data(
                 package.filename, package._artifacts.get(), domain
             )
@@ -82,6 +91,48 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
                     batch = []
                     set_of_update_fields.clear()
 
+        for package in progress_report.iter(
+            on_demand_content.prefetch_related(
+                "contentartifact_set__remoteartifact_set"
+            ).iterator(chunk_size=1000)
+        ):
+            # Skip packages already repaired from a local artifact, as well as
+            # duplicate rows produced by the join on RemoteArtifact.
+            if package.pk in processed_pks:
+                continue
+            processed_pks.add(package.pk)
+
+            # Resolve the RemoteArtifact afresh for every package so a value left over
+            # from a previous iteration is never reused (and never unbound).
+            remote_artifact = next(
+                (
+                    ra
+                    for content_artifact in package.contentartifact_set.all()
+                    for ra in content_artifact.remoteartifact_set.all()
+                ),
+                None,
+            )
+            if remote_artifact is None:
+                log.warning("No RemoteArtifact found for %s; skipping.", package.filename)
+                continue
+
+            new_data = remote_artifact_to_python_content_data(
+                package.filename, remote_artifact, domain
+            )
+            changed = False
+            for field, value in new_data.items():
+                if getattr(package, field) != value:
+                    setattr(package, field, value)
+                    set_of_update_fields.add(field)
+                    changed = True
+            if changed:
+                batch.append(package)
+                if len(batch) == 1000:
+                    total_repaired += len(batch)
+                    PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
+                    batch = []
+                    set_of_update_fields.clear()
+
         if batch:
             total_repaired += len(batch)
             PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
diff --git a/pulp_python/app/utils.py b/pulp_python/app/utils.py
index 8baf0e9d..01444ede 100644
--- a/pulp_python/app/utils.py
+++ b/pulp_python/app/utils.py
@@ -1,12 +1,13 @@
 import pkginfo
 import re
+import requests
 import shutil
 import tempfile
 import json
 from collections import defaultdict
 from django.conf import settings
 from jinja2 import Template
-from packaging.utils import canonicalize_name
+from packaging.utils import canonicalize_name, parse_sdist_filename, parse_wheel_filename
 from packaging.requirements import Requirement
 from packaging.version import parse, InvalidVersion
 
@@ -189,6 +190,101 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
     return data
 
 
+def get_project_metadata_from_pypi_json(package_name, version):
+    """
+    Fetches metadata for a specific version of a given package from PyPI's JSON API.
+
+    Args:
+        package_name: Project name used to build the request URL.
+        version: Release version used to build the request URL.
+
+    Returns:
+        dict: The "info" section of the JSON response.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status, e.g. when the
+            requested version does not exist.
+    """
+    # todo: fix URL (hard-coded to the fixtures server instead of the package's remote)
+    # https://pypi.org/pypi/scipy/1.1.0/json
+    # https://fixtures.pulpproject.org/python-pypi/pypi/scipy/1.1.0/json
+    URL = "https://fixtures.pulpproject.org/python-pypi/"
+
+    url = f"{URL}pypi/{package_name}/{version}/json"
+    response = requests.get(url, timeout=10)
+    response.raise_for_status()
+    data = response.json()
+
+    return data["info"]
+
+
+def get_packagetype_and_python_version(filename):
+    """
+    Derives the package type and python version from a distribution filename.
+
+    Args:
+        filename: Name of the distribution file, e.g. "scipy-1.1.0.tar.gz".
+
+    Returns:
+        tuple: (packagetype, python_version); python_version is "source" for sdists.
+
+    Raises:
+        ValueError: If the filename has no extension listed in DIST_EXTENSIONS.
+    """
+    extension = next((ext for ext in DIST_EXTENSIONS if filename.endswith(ext)), None)
+    if extension is None:
+        raise ValueError(f"Unsupported distribution filename: {filename}")
+    packagetype = DIST_EXTENSIONS[extension]
+
+    if packagetype == "sdist":
+        return packagetype, "source"
+
+    python_version = ""
+    if bdist_name := DIST_REGEXES[extension].match(filename):
+        python_version = bdist_name.group("pyver") or ""
+    return packagetype, python_version
+
+
+def remote_artifact_to_python_content_data(filename, remote_artifact, domain=None):
+    """
+    Builds PythonPackageContent data for an on-demand package from upstream JSON metadata.
+
+    Args:
+        filename: Filename of the package content.
+        remote_artifact: RemoteArtifact whose URL names the upstream distribution file.
+        domain: Domain to record in the data; defaults to the remote artifact's domain.
+
+    Returns:
+        dict: Content data as produced by parse_project_metadata, plus filename and domain.
+
+    Raises:
+        ValueError: If the remote file is neither a wheel nor a .tar.gz/.zip sdist.
+    """
+    ra_filename = remote_artifact.url.rsplit("/", 1)[-1]
+    # todo: better handle, more formats?
+    if ra_filename.endswith(".whl"):
+        name, version, *_ = parse_wheel_filename(ra_filename)
+    elif ra_filename.endswith((".tar.gz", ".zip")):
+        name, version = parse_sdist_filename(ra_filename)
+    else:
+        # Fail loudly here rather than hitting an unbound "name" further down.
+        raise ValueError(f"Unsupported distribution filename: {ra_filename}")
+
+    metadata = get_project_metadata_from_pypi_json(name, version)
+
+    # todo: rewrite
+    packagetype, python_version = get_packagetype_and_python_version(filename)
+    metadata["packagetype"] = packagetype
+    metadata["python_version"] = python_version
+
+    data = parse_project_metadata(metadata)
+    # data['sha256'] = remote_artifact.sha256
+    data["filename"] = filename
+    data["pulp_domain"] = domain or remote_artifact.pulp_domain
+    data["_pulp_domain"] = data["pulp_domain"]
+    return data
+
+
 def python_content_to_json(base_path, content_query, version=None, domain=None):
     """
     Converts a QuerySet of PythonPackageContent into the PyPi JSON format
diff --git a/pulp_python/tests/functional/api/test_repair.py b/pulp_python/tests/functional/api/test_repair.py
index 4b2bce55..5686c49c 100644
--- a/pulp_python/tests/functional/api/test_repair.py
+++ b/pulp_python/tests/functional/api/test_repair.py
@@ -32,6 +32,69 @@ def _create(artifact_filename, filename, content_data):
     return _create
 
 
+@pytest.fixture
+def create_content_remote(python_bindings):
+    def _create(filename, r_artifact_url, content_data, remote):
+        commands = (
+            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
+            "from pulpcore.plugin.util import extract_pk, get_url; "
+            "from pulp_python.app.models import PythonPackageContent, PythonRemote; "
+            f"c = PythonPackageContent(filename={filename!r}, **{content_data!r}); "
+            "c.save(); "
+            f"ca = ContentArtifact(artifact=None, content=c, relative_path={filename!r}); "
+            "ca.save(); "
+            f"remote_obj = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
+            f"ra = RemoteArtifact(content_artifact=ca, remote=remote_obj, url={r_artifact_url!r}); "
+            "ra.save(); "
+            "print(get_url(c))"
+        )
+        process = subprocess.run(
+            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
+        )
+
+        assert process.returncode == 0
+        content_href = process.stdout.decode().strip()
+        return python_bindings.ContentPackagesApi.read(content_href)
+
+    return _create
+
+
+def test_metadata_repair_endpoint_on_demand(
+    create_content_remote,
+    monitor_task,
+    move_to_repository,
+    python_bindings,
+    python_remote_factory,
+    python_repo_factory,
+):
+    python_egg_filename = "scipy-1.1.0.tar.gz"
+    python_egg_url = urljoin(
+        urljoin(PYTHON_FIXTURES_URL, "packages/"), python_egg_filename
+    )
+    data = {
+        "name": "scipy",
+        # Wrong metadata
+        "author": "ME",
+        "packagetype": "bdist",
+        "requires_python": ">=3.8",
+        "version": "0.2",
+    }
+    remote = python_remote_factory(includes=["scipy"])
+    repo = python_repo_factory(remote=remote)
+
+    content = create_content_remote(python_egg_filename, python_egg_url, data, remote)
+    move_to_repository(repo.pulp_href, [content.pulp_href])
+
+    response = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href)
+    monitor_task(response.task)
+
+    new_content = python_bindings.ContentPackagesApi.read(content.pulp_href)
+    assert new_content.author == ""
+    assert new_content.packagetype == "sdist"
+    assert new_content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
+    assert new_content.version == "1.1.0"
+
+
 @pytest.fixture
 def move_to_repository(python_bindings, monitor_task):
     def _move(repo_href, content_hrefs):