Skip to content

Add utility to lazily acquire wheel metadata over HTTP #8467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
27 changes: 2 additions & 25 deletions src/pip/_internal/network/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pip._internal.cli.progress_bars import DownloadProgressProvider
from pip._internal.models.index import PyPI
from pip._internal.network.cache import is_from_cache
from pip._internal.network.utils import response_chunks
from pip._internal.network.utils import HEADERS, response_chunks
from pip._internal.utils.misc import (
format_size,
redact_auth_from_url,
Expand Down Expand Up @@ -132,30 +132,7 @@ def _get_http_response_filename(resp, link):
def _http_get_download(session, link):
    # type: (PipSession, Link) -> Response
    """Start a streaming GET request for the given link.

    The fragment part of the link's URL (everything from the first '#')
    is dropped before the request is made.  The response is returned
    with its body still unread (stream=True) so that the caller can
    consume it in chunks.

    Any error reported by the response's raise_for_status() propagates
    to the caller.
    """
    target_url = link.url.split('#', 1)[0]
    # Issue exactly one request.  HEADERS asks for the identity encoding
    # so that a server does not compress an already-compressed file once
    # more; the full rationale is documented next to HEADERS itself.
    resp = session.get(target_url, headers=HEADERS, stream=True)
    resp.raise_for_status()
    return resp

Expand Down
221 changes: 221 additions & 0 deletions src/pip/_internal/network/lazy_wheel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
"""Lazy ZIP over HTTP"""

__all__ = ['dist_from_wheel_url']

from bisect import bisect_left, bisect_right
from contextlib import contextmanager
from tempfile import NamedTemporaryFile
from zipfile import BadZipfile, ZipFile

from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
from pip._vendor.six.moves import range

from pip._internal.network.utils import HEADERS, response_chunks
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
from pip._internal.utils.wheel import pkg_resources_distribution_for_wheel

if MYPY_CHECK_RUNNING:
from typing import Any, Dict, Iterator, List, Optional, Tuple

from pip._vendor.pkg_resources import Distribution
from pip._vendor.requests.models import Response

from pip._internal.network.session import PipSession


def dist_from_wheel_url(name, url, session):
    # type: (str, str, PipSession) -> Distribution
    """Build a pkg_resources.Distribution for the wheel at the given URL.

    Only the portion of the wheel that holds the metadata is fetched,
    by means of HTTP range requests, which is just enough for the
    object to be constructed.  RuntimeError is raised if such requests
    are not supported.
    """
    with LazyZipOverHTTP(url, session) as lazy_wheel:
        # ZipFile only calls read, seek, seekable and tell on a
        # read-only file object, so the whole IO protocol is not needed.
        archive = ZipFile(lazy_wheel)  # type: ignore
        # By intention, lazy_wheel.name no longer refers to a valid
        # file once the context manager has exited.
        location = lazy_wheel.name
        return pkg_resources_distribution_for_wheel(archive, name, location)


class LazyZipOverHTTP(object):
    """File-like object mapped to a ZIP file over HTTP.

    This uses HTTP range requests to lazily fetch the file's content,
    which is supposed to be fed to ZipFile.  If such requests are not
    supported by the server, raise RuntimeError during initialization.

    Fetched bytes are written into a temporary file that is resized to
    the length announced by the server.  The inclusive byte intervals
    that have already been requested are recorded in the two parallel
    sorted lists ``_left`` (interval starts) and ``_right`` (interval
    ends), which ``_download`` searches with bisect.
    """

    def __init__(self, url, session, chunk_size=CONTENT_CHUNK_SIZE):
        # type: (str, PipSession, int) -> None
        head = session.head(url, headers=HEADERS)
        head.raise_for_status()
        assert head.status_code == 200
        self._session, self._url, self._chunk_size = session, url, chunk_size
        self._length = int(head.headers['Content-Length'])
        # Verify range support *before* creating the temporary file so
        # that nothing is left open when the server cannot be used.
        if 'bytes' not in head.headers.get('Accept-Ranges', 'none'):
            raise RuntimeError('range request is not supported')
        self._left = []  # type: List[int]
        self._right = []  # type: List[int]
        self._file = NamedTemporaryFile()
        try:
            self.truncate(self._length)
            self._check_zip()
        except BaseException:
            # __exit__ is never reached when construction fails, so the
            # temporary file has to be released here.
            self._file.close()
            raise

    @property
    def mode(self):
        # type: () -> str
        """Opening mode, which is always rb."""
        return 'rb'

    @property
    def name(self):
        # type: () -> str
        """Path to the underlying file."""
        return self._file.name

    def seekable(self):
        # type: () -> bool
        """Return whether random access is supported, which is True."""
        return True

    def close(self):
        # type: () -> None
        """Close the file."""
        self._file.close()

    @property
    def closed(self):
        # type: () -> bool
        """Whether the file is closed."""
        return self._file.closed

    def read(self, size=-1):
        # type: (int) -> bytes
        """Read up to size bytes from the object and return them.

        As a convenience, if size is unspecified or -1,
        all bytes until EOF are returned.  Fewer than
        size bytes may be returned if EOF is reached.
        """
        start, length = self.tell(), self._length
        stop = start + size if 0 <= size <= length-start else length
        # A zero-sized read, or a read at or past EOF, needs no data.
        # Skipping the download also avoids recording an empty or
        # inverted interval in the bookkeeping lists.
        if stop > start:
            self._download(start, stop-1)
        return self._file.read(size)

    def readable(self):
        # type: () -> bool
        """Return whether the file is readable, which is True."""
        return True

    def seek(self, offset, whence=0):
        # type: (int, int) -> int
        """Change stream position and return the new absolute position.

        Seek to offset relative to the position indicated by whence:
        * 0: Start of stream (the default).  offset should be >= 0;
        * 1: Current position - offset may be negative;
        * 2: End of stream - offset usually negative.
        """
        return self._file.seek(offset, whence)

    def tell(self):
        # type: () -> int
        """Return the current position."""
        return self._file.tell()

    def truncate(self, size=None):
        # type: (Optional[int]) -> int
        """Resize the stream to the given size in bytes.

        If size is unspecified resize to the current position.
        The current stream position isn't changed.

        Return the new file size.
        """
        return self._file.truncate(size)

    def writable(self):
        # type: () -> bool
        """Return False."""
        return False

    def __enter__(self):
        # type: () -> LazyZipOverHTTP
        self._file.__enter__()
        return self

    def __exit__(self, *exc):
        # type: (*Any) -> Optional[bool]
        return self._file.__exit__(*exc)

    @contextmanager
    def _stay(self):
        # type: () -> Iterator[None]
        """Return a context manager keeping the position.

        At the end of the block, seek back to original position.
        """
        pos = self.tell()
        try:
            yield
        finally:
            self.seek(pos)

    def _check_zip(self):
        # type: () -> None
        """Check and download until the file is a valid ZIP.

        Starting from the last chunk and moving towards the beginning
        of the file, download everything from the chunk's start to the
        end of the file, stopping as soon as ZipFile accepts the object.
        If no attempt succeeds, return silently.
        """
        end = self._length - 1
        for start in reversed(range(0, end, self._chunk_size)):
            self._download(start, end)
            with self._stay():
                try:
                    # For read-only ZIP files, ZipFile only needs
                    # methods read, seek, seekable and tell.
                    ZipFile(self)  # type: ignore
                except BadZipfile:
                    # Not enough of the archive's tail is available
                    # yet; extend the download by one more chunk.
                    pass
                else:
                    break

    def _stream_response(self, start, end, base_headers=HEADERS):
        # type: (int, int, Dict[str, str]) -> Response
        """Return HTTP response to a range request from start to end.

        The headers in base_headers are applied on top of the Range
        header built from start and end (both inclusive).
        """
        headers = {'Range': 'bytes={}-{}'.format(start, end)}
        headers.update(base_headers)
        return self._session.get(self._url, headers=headers, stream=True)

    def _merge(self, start, end, left, right):
        # type: (int, int, int, int) -> Iterator[Tuple[int, int]]
        """Return an iterator of intervals to be fetched.

        Once the iterator is exhausted, the recorded intervals with
        indices from left up to (excluding) right are replaced by one
        single interval covering both them and the needed interval.

        Args:
            start (int): Start of needed interval
            end (int): End of needed interval
            left (int): Index of first overlapping downloaded data
            right (int): Index after last overlapping downloaded data
        """
        lslice, rslice = self._left[left:right], self._right[left:right]
        # Widen the needed interval so that it fully contains every
        # overlapping recorded interval.
        i = start = min([start]+lslice[:1])
        end = max([end]+rslice[-1:])
        # Yield the gaps that lie between the recorded intervals.
        for j, k in zip(lslice, rslice):
            if j > i:
                yield i, j-1
            i = k + 1
        if i <= end:
            yield i, end
        self._left[left:right], self._right[left:right] = [start], [end]

    def _download(self, start, end):
        # type: (int, int) -> None
        """Download bytes from start to end inclusively.

        Only the parts of the interval not recorded as fetched yet are
        requested.  The stream position is left unchanged.
        """
        with self._stay():
            # Recorded intervals with an index in [left, right) overlap
            # the interval from start to end.
            left = bisect_left(self._right, start)
            right = bisect_right(self._left, end)
            for first, last in self._merge(start, end, left, right):
                response = self._stream_response(first, last)
                response.raise_for_status()
                self.seek(first)
                for chunk in response_chunks(response, self._chunk_size):
                    self._file.write(chunk)
23 changes: 22 additions & 1 deletion src/pip/_internal/network/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,28 @@
from pip._internal.utils.typing import MYPY_CHECK_RUNNING

if MYPY_CHECK_RUNNING:
from typing import Iterator
from typing import Dict, Iterator

# The following comments and HTTP headers were originally added by
# Donald Stufft in git commit 22c562429a61bb77172039e480873fb239dd8c03.
#
# We use Accept-Encoding: identity here because requests defaults to
# accepting compressed responses. This breaks in a variety of ways
# depending on how the server is configured.
# - Some servers will notice that the file isn't a compressible file
# and will leave the file alone and with an empty Content-Encoding
# - Some servers will notice that the file is already compressed and
# will leave the file alone, adding a Content-Encoding: gzip header
# - Some servers won't notice anything at all and will take a file
# that's already been compressed and compress it again, and set
# the Content-Encoding: gzip header
# By setting this to request only the identity encoding we're hoping
# to eliminate the third case. Hopefully there does not exist a server
# which when given a file will notice it is already compressed and that
# you're not asking for a compressed file and will then decompress it
# before sending because if that's the case I don't think it'll ever be
# possible to make this work.
HEADERS = {'Accept-Encoding': 'identity'} # type: Dict[str, str]


def response_chunks(response, chunk_size=CONTENT_CHUNK_SIZE):
Expand Down
2 changes: 1 addition & 1 deletion tests/lib/requests_mocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, contents):
self.status_code = 200
self.connection = None
self.url = None
self.headers = {}
self.headers = {'Content-Length': len(contents)}
self.history = []

def raise_for_status(self):
Expand Down
50 changes: 50 additions & 0 deletions tests/unit/test_network_lazy_wheel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from zipfile import BadZipfile

from pip._vendor.pkg_resources import Requirement
from pytest import fixture, mark, raises

from pip._internal.network.lazy_wheel import dist_from_wheel_url
from pip._internal.network.session import PipSession
from tests.lib.requests_mocks import MockResponse

# URL of the mypy 0.782 wheel that the tests below use as their sample.
MYPY_0_782_WHL = (
'https://files.pythonhosted.org/packages/9d/65/'
'b96e844150ce18b9892b155b780248955ded13a2581d31872e7daa90a503/'
'mypy-0.782-py3-none-any.whl'
)
# Requirements the distribution built from the sample wheel is expected
# to report when asked for its requirements including all of its extras.
MYPY_0_782_REQS = {
Requirement('typed-ast (<1.5.0,>=1.4.0)'),
Requirement('typing-extensions (>=3.7.4)'),
Requirement('mypy-extensions (<0.5.0,>=0.4.3)'),
Requirement('psutil (>=4.0); extra == "dmypy"'),
}


@fixture
def session():
    """Provide a fresh PipSession to the test requesting it."""
    pip_session = PipSession()
    return pip_session


@mark.network
def test_dist_from_wheel_url(session):
    """Check that the acquired distribution carries correct information."""
    distribution = dist_from_wheel_url('mypy', MYPY_0_782_WHL, session)
    assert distribution.key == 'mypy'
    assert distribution.version == '0.782'
    assert distribution.extras == ['dmypy']
    requirements = distribution.requires(distribution.extras)
    assert set(requirements) == MYPY_0_782_REQS


@mark.network
def test_dist_from_wheel_url_no_range(session, monkeypatch):
    """Check the failure mode when HTTP range requests are unsupported."""

    def head_without_ranges(*args, **kwargs):
        # The mocked response carries no Accept-Ranges header.
        return MockResponse(b'')

    monkeypatch.setattr(session, 'head', head_without_ranges)
    with raises(RuntimeError):
        dist_from_wheel_url('mypy', MYPY_0_782_WHL, session)


@mark.network
def test_dist_from_wheel_url_not_zip(session):
    """Check the failure mode when the URL does not point to a ZIP."""
    name, url = 'python', 'https://www.python.org/'
    with raises(BadZipfile):
        dist_from_wheel_url(name, url, session)