Skip to content

DRAFT: Add on_demand content to repair_metadata #848

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 46 additions & 2 deletions pulp_python/app/tasks/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
from pulpcore.plugin.util import get_domain

from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import artifact_to_python_content_data
from pulp_python.app.utils import (
artifact_to_python_content_data,
remote_artifact_to_python_content_data,
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -49,22 +52,29 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
"""
# TODO: Add on_demand content repair
immediate_content = content.filter(contentartifact__artifact__isnull=False)
on_demand_content = content.filter(contentartifact__remoteartifact__isnull=False)
# todo: distinct() to avoid duplication?
domain = get_domain()

batch = []
set_of_update_fields = set()
total_repaired = 0
processed_pks = set()

progress_report = ProgressReport(
message="Repairing packages' metadata",
code="repair.metadata",
total=immediate_content.count(),
total=immediate_content.count() + on_demand_content.count(),
)
progress_report.save()
with progress_report:
for package in progress_report.iter(
immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
):
if package.pk in processed_pks:
continue
processed_pks.add(package.pk)

new_data = artifact_to_python_content_data(
package.filename, package._artifacts.get(), domain
)
Expand All @@ -82,6 +92,40 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
batch = []
set_of_update_fields.clear()

for package in progress_report.iter(
on_demand_content.prefetch_related(
"contentartifact_set__remoteartifact_set"
).iterator(chunk_size=1000)
):
if package.pk in processed_pks:
continue
processed_pks.add(package.pk)

# todo
for content_artifact in package.contentartifact_set.all():
for ra in content_artifact.remoteartifact_set.all():
remote_artifact = ra
break
if remote_artifact:
break

new_data = remote_artifact_to_python_content_data(
package.filename, remote_artifact, domain
)
changed = False
for field, value in new_data.items():
if getattr(package, field) != value:
setattr(package, field, value)
set_of_update_fields.add(field)
changed = True
if changed:
batch.append(package)
if len(batch) == 1000:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
batch = []
set_of_update_fields.clear()

if batch:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
Expand Down
61 changes: 60 additions & 1 deletion pulp_python/app/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import pkginfo
import re
import requests
import shutil
import tempfile
import json
from collections import defaultdict
from django.conf import settings
from jinja2 import Template
from packaging.utils import canonicalize_name
from packaging.utils import canonicalize_name, parse_sdist_filename, parse_wheel_filename
from packaging.requirements import Requirement
from packaging.version import parse, InvalidVersion

Expand Down Expand Up @@ -189,6 +190,64 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
return data


def get_project_metadata_from_pypi_json(
    package_name, version, base_url="https://fixtures.pulpproject.org/python-pypi/"
):
    """
    Fetches metadata for a specific version of a given package from PyPI's JSON API.

    Args:
        package_name: Project name, interpolated into the request path.
        version: Release version, interpolated into the request path.
        base_url: Root of the server exposing ``pypi/<name>/<version>/json``. It may be given
            with or without a trailing slash. Defaults to the fixtures server that used to be
            hard-coded in this function.

    Returns:
        The value stored under the ``info`` key of the JSON document sent by the server.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    # todo: derive the base URL from the remote instead of relying on the default
    # https://pypi.org/pypi/scipy/1.1.0/json
    # https://fixtures.pulpproject.org/python-pypi/pypi/scipy/1.1.0/json
    url = f"{base_url.rstrip('/')}/pypi/{package_name}/{version}/json"
    # todo: raise a dedicated error if the version does not exist
    response = requests.get(url, timeout=10)
    # Turn 4xx/5xx answers into an exception instead of parsing an error body.
    response.raise_for_status()
    return response.json()["info"]


def get_packagetype_and_python_version(filename):
    """
    Derive the package type and the python version tag from a distribution filename.

    The first key of ``DIST_EXTENSIONS`` that ``filename`` ends with selects the package type.
    A package type of ``"sdist"`` always yields ``"source"`` as python version. For any other
    type the ``pyver`` group of the matching ``DIST_REGEXES`` pattern is used, falling back to
    an empty string when the pattern does not match or the group is empty.

    Args:
        filename: Name of the distribution file.

    Returns:
        A ``(packagetype, python_version)`` tuple.

    Raises:
        ValueError: If ``filename`` ends with none of the known extensions.
    """
    extension = next((ext for ext in DIST_EXTENSIONS if filename.endswith(ext)), None)
    if extension is None:
        # Previously surfaced as the opaque "True is not in list" from list.index().
        raise ValueError(f"Unsupported distribution filename: {filename!r}")
    packagetype = DIST_EXTENSIONS[extension]

    if packagetype == "sdist":
        return packagetype, "source"

    match = DIST_REGEXES[extension].match(filename)
    python_version = (match.group("pyver") or "") if match else ""
    return packagetype, python_version


def remote_artifact_to_python_content_data(filename, remote_artifact, domain=None):
    """
    Build the content data of a package from its remote artifact.

    The project name and version are parsed from the last path segment of the remote
    artifact's URL and then used to fetch the project metadata from the JSON API.

    Args:
        filename: Filename of the package content; used to derive the package type and the
            python version, and stored in the returned data.
        remote_artifact: Object providing ``url`` and ``pulp_domain``.
        domain: Domain to store in the result; falls back to ``remote_artifact.pulp_domain``.

    Returns:
        The dict produced by ``parse_project_metadata`` extended with ``filename``,
        ``pulp_domain`` and ``_pulp_domain``.

    Raises:
        ValueError: If the remote filename is neither a wheel nor a ``.tar.gz``/``.zip`` sdist.
    """
    ra_filename = remote_artifact.url.rsplit("/", 1)[-1]
    # todo: better handle, more formats?
    if ra_filename.endswith(".whl"):
        name, version, *_ = parse_wheel_filename(ra_filename)
    elif ra_filename.endswith((".tar.gz", ".zip")):
        name, version = parse_sdist_filename(ra_filename)
    else:
        # Without this branch name/version stay unbound and the lookup below would fail
        # with an UnboundLocalError.
        raise ValueError(f"Unsupported remote artifact filename: {ra_filename!r}")

    metadata = get_project_metadata_from_pypi_json(name, version)

    # todo: rewrite
    packagetype, python_version = get_packagetype_and_python_version(filename)
    metadata["packagetype"] = packagetype
    metadata["python_version"] = python_version

    data = parse_project_metadata(metadata)
    # data['sha256'] = remote_artifact.sha256
    data["filename"] = filename
    data["pulp_domain"] = domain or remote_artifact.pulp_domain
    data["_pulp_domain"] = data["pulp_domain"]
    return data


def python_content_to_json(base_path, content_query, version=None, domain=None):
"""
Converts a QuerySet of PythonPackageContent into the PyPi JSON format
Expand Down
63 changes: 63 additions & 0 deletions pulp_python/tests/functional/api/test_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,69 @@ def _create(artifact_filename, filename, content_data):
return _create


@pytest.fixture
def create_content_remote(python_bindings):
    """
    Provide a factory that creates package content backed only by a remote artifact.

    The factory runs a script through ``pulpcore-manager shell`` which saves the content, a
    ContentArtifact without artifact and a RemoteArtifact pointing at the given URL, and then
    reads the new content back through the bindings.
    """

    def _create(filename, r_artifact_url, content_data, remote):
        # One shell statement per entry; they are joined into a single "-c" script below.
        statements = [
            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact",
            "from pulpcore.plugin.util import extract_pk, get_url",
            "from pulp_python.app.models import PythonPackageContent, PythonRemote",
            f"c = PythonPackageContent(filename={filename!r}, **{content_data!r})",
            "c.save()",
            f"ca = ContentArtifact(artifact=None, content=c, relative_path={filename!r})",
            "ca.save()",
            f"remote_obj = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r}))",
            f"ra = RemoteArtifact(content_artifact=ca, remote=remote_obj, url={r_artifact_url!r})",
            "ra.save()",
            "print(get_url(c))",
        ]
        result = subprocess.run(
            ["pulpcore-manager", "shell", "-c", "; ".join(statements)],
            capture_output=True,
        )

        assert result.returncode == 0
        # The script prints the href of the created content as its only output.
        href = result.stdout.decode().strip()
        return python_bindings.ContentPackagesApi.read(href)

    return _create


def test_metadata_repair_endpoint_on_demand(
    create_content_remote,
    monitor_task,
    move_to_repository,
    python_bindings,
    python_remote_factory,
    python_repo_factory,
):
    """Check that the repair_metadata endpoint corrects wrong metadata of on-demand content."""
    sdist_filename = "scipy-1.1.0.tar.gz"
    packages_url = urljoin(PYTHON_FIXTURES_URL, "packages/")
    sdist_url = urljoin(packages_url, sdist_filename)

    remote = python_remote_factory(includes=["scipy"])
    repo = python_repo_factory(remote=remote)

    # Everything except the name is deliberately wrong so the repair has something to fix.
    wrong_metadata = {
        "name": "scipy",
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
        "version": "0.2",
    }
    content = create_content_remote(sdist_filename, sdist_url, wrong_metadata, remote)
    move_to_repository(repo.pulp_href, [content.pulp_href])

    repair_task = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href).task
    monitor_task(repair_task)

    repaired = python_bindings.ContentPackagesApi.read(content.pulp_href)
    assert repaired.author == ""
    assert repaired.packagetype == "sdist"
    assert repaired.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
    assert repaired.version == "1.1.0"


@pytest.fixture
def move_to_repository(python_bindings, monitor_task):
def _move(repo_href, content_hrefs):
Expand Down