Skip to content

Commit 18431be

Browse files
authored
Merge pull request #8467 from McSinyx/lazy-wheel
2 parents 93b0683 + 25a25a0 commit 18431be

File tree

6 files changed

+296
-27
lines changed

6 files changed

+296
-27
lines changed

news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial

Whitespace-only changes.

src/pip/_internal/network/download.py

+2-25
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pip._internal.cli.progress_bars import DownloadProgressProvider
1212
from pip._internal.models.index import PyPI
1313
from pip._internal.network.cache import is_from_cache
14-
from pip._internal.network.utils import response_chunks
14+
from pip._internal.network.utils import HEADERS, response_chunks
1515
from pip._internal.utils.misc import (
1616
format_size,
1717
redact_auth_from_url,
@@ -132,30 +132,7 @@ def _get_http_response_filename(resp, link):
132132
def _http_get_download(session, link):
133133
# type: (PipSession, Link) -> Response
134134
target_url = link.url.split('#', 1)[0]
135-
resp = session.get(
136-
target_url,
137-
# We use Accept-Encoding: identity here because requests
138-
# defaults to accepting compressed responses. This breaks in
139-
# a variety of ways depending on how the server is configured.
140-
# - Some servers will notice that the file isn't a compressible
141-
# file and will leave the file alone and with an empty
142-
# Content-Encoding
143-
# - Some servers will notice that the file is already
144-
# compressed and will leave the file alone and will add a
145-
# Content-Encoding: gzip header
146-
# - Some servers won't notice anything at all and will take
147-
# a file that's already been compressed and compress it again
148-
# and set the Content-Encoding: gzip header
149-
# By setting this to request only the identity encoding We're
150-
# hoping to eliminate the third case. Hopefully there does not
151-
# exist a server which when given a file will notice it is
152-
# already compressed and that you're not asking for a
153-
# compressed file and will then decompress it before sending
154-
# because if that's the case I don't think it'll ever be
155-
# possible to make this work.
156-
headers={"Accept-Encoding": "identity"},
157-
stream=True,
158-
)
135+
resp = session.get(target_url, headers=HEADERS, stream=True)
159136
resp.raise_for_status()
160137
return resp
161138

+221
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
"""Lazy ZIP over HTTP"""
2+
3+
__all__ = ['dist_from_wheel_url']
4+
5+
from bisect import bisect_left, bisect_right
6+
from contextlib import contextmanager
7+
from tempfile import NamedTemporaryFile
8+
from zipfile import BadZipfile, ZipFile
9+
10+
from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
11+
from pip._vendor.six.moves import range
12+
13+
from pip._internal.network.utils import HEADERS, response_chunks
14+
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
15+
from pip._internal.utils.wheel import pkg_resources_distribution_for_wheel
16+
17+
if MYPY_CHECK_RUNNING:
18+
from typing import Any, Dict, Iterator, List, Optional, Tuple
19+
20+
from pip._vendor.pkg_resources import Distribution
21+
from pip._vendor.requests.models import Response
22+
23+
from pip._internal.network.session import PipSession
24+
25+
26+
def dist_from_wheel_url(name, url, session):
    # type: (str, str, PipSession) -> Distribution
    """Build a pkg_resources.Distribution for the wheel at the given URL.

    Only the portion of the wheel holding its metadata is fetched, via
    HTTP range requests, which is just enough to construct the object.
    RuntimeError is raised if the server does not support such requests.
    """
    with LazyZipOverHTTP(url, session) as lazy_wheel:
        # Opening an archive read-only, ZipFile relies solely on the
        # read, seek, seekable and tell methods, so the complete IO
        # protocol is not required from the lazy file object.
        archive = ZipFile(lazy_wheel)  # type: ignore
        # The temporary file's path is deliberately handed over as the
        # location, even though it is an invalid file once the context
        # manager exits.
        location = lazy_wheel.name
        return pkg_resources_distribution_for_wheel(archive, name, location)
41+
42+
43+
class LazyZipOverHTTP(object):
    """File-like object mapped to a ZIP file over HTTP.

    This uses HTTP range requests to lazily fetch the file's content,
    which is supposed to be fed to ZipFile.  If such requests are not
    supported by the server, raise RuntimeError during initialization.

    The content is stored in a temporary file of the same size as the
    remote one.  The byte ranges already fetched are tracked by two
    parallel sorted lists, ``_left`` and ``_right``, holding the
    inclusive start and end offsets of each downloaded interval.
    """

    def __init__(self, url, session, chunk_size=CONTENT_CHUNK_SIZE):
        # type: (str, PipSession, int) -> None
        """Probe the URL and download until the file is a valid ZIP.

        Args:
            url (str): Location of the remote ZIP file
            session (PipSession): Session used for every HTTP request
            chunk_size (int): Step, in bytes, by which the downloaded
                tail grows while looking for the ZIP directory; also
                the size of chunks written to the temporary file

        Raises:
            RuntimeError: If the server does not advertise support
                for byte range requests.
        """
        head = session.head(url, headers=HEADERS)
        head.raise_for_status()
        assert head.status_code == 200
        self._session, self._url, self._chunk_size = session, url, chunk_size
        self._length = int(head.headers['Content-Length'])
        # Reject servers lacking range support before the temporary
        # file is created, so that there is nothing to clean up.
        if 'bytes' not in head.headers.get('Accept-Ranges', 'none'):
            raise RuntimeError('range request is not supported')
        self._left = []  # type: List[int]
        self._right = []  # type: List[int]
        self._file = NamedTemporaryFile()
        try:
            self.truncate(self._length)
            self._check_zip()
        except BaseException:
            # The caller never receives the half-initialized object
            # and thus cannot close the temporary file itself.
            self._file.close()
            raise

    @property
    def mode(self):
        # type: () -> str
        """Opening mode, which is always rb."""
        return 'rb'

    @property
    def name(self):
        # type: () -> str
        """Path to the underlying file."""
        return self._file.name

    def seekable(self):
        # type: () -> bool
        """Return whether random access is supported, which is True."""
        return True

    def close(self):
        # type: () -> None
        """Close the file."""
        self._file.close()

    @property
    def closed(self):
        # type: () -> bool
        """Whether the file is closed."""
        return self._file.closed

    def read(self, size=-1):
        # type: (int) -> bytes
        """Read up to size bytes from the object and return them.

        As a convenience, if size is unspecified or -1,
        all bytes until EOF are returned.  Fewer than
        size bytes may be returned if EOF is reached.

        The requested bytes are downloaded first if they have not
        been fetched yet.
        """
        start, length = self.tell(), self._length
        stop = start + size if 0 <= size <= length-start else length
        # Nothing needs fetching for an empty read (size 0 or position
        # at or past EOF); skipping it also avoids recording an empty
        # interval whose end precedes its start.
        if stop > start:
            self._download(start, stop-1)
        return self._file.read(size)

    def readable(self):
        # type: () -> bool
        """Return whether the file is readable, which is True."""
        return True

    def seek(self, offset, whence=0):
        # type: (int, int) -> int
        """Change stream position and return the new absolute position.

        Seek to offset relative position indicated by whence:
        * 0: Start of stream (the default).  offset should be >= 0;
        * 1: Current position - offset may be negative;
        * 2: End of stream - offset usually negative.
        """
        return self._file.seek(offset, whence)

    def tell(self):
        # type: () -> int
        """Return the current position."""
        return self._file.tell()

    def truncate(self, size=None):
        # type: (Optional[int]) -> int
        """Resize the stream to the given size in bytes.

        If size is unspecified resize to the current position.
        The current stream position isn't changed.

        Return the new file size.
        """
        return self._file.truncate(size)

    def writable(self):
        # type: () -> bool
        """Return False."""
        return False

    def __enter__(self):
        # type: () -> LazyZipOverHTTP
        """Enter the context of the underlying file and return self."""
        self._file.__enter__()
        return self

    def __exit__(self, *exc):
        # type: (*Any) -> Optional[bool]
        """Exit the context of the underlying file."""
        return self._file.__exit__(*exc)

    @contextmanager
    def _stay(self):
        # type: ()-> Iterator[None]
        """Return a context manager keeping the position.

        At the end of the block, seek back to original position.
        """
        pos = self.tell()
        try:
            yield
        finally:
            self.seek(pos)

    def _check_zip(self):
        # type: () -> None
        """Check and download until the file is a valid ZIP.

        The downloaded tail of the file is extended backwards, one
        chunk at a time, until ZipFile accepts the file or the whole
        content has been fetched.
        """
        end = self._length - 1
        for start in reversed(range(0, end, self._chunk_size)):
            self._download(start, end)
            with self._stay():
                try:
                    # For read-only ZIP files, ZipFile only needs
                    # methods read, seek, seekable and tell.
                    ZipFile(self)  # type: ignore
                except BadZipfile:
                    # Not enough of the tail yet: fetch one more chunk.
                    pass
                else:
                    break

    def _stream_response(self, start, end, base_headers=HEADERS):
        # type: (int, int, Dict[str, str]) -> Response
        """Return HTTP response to a range request from start to end.

        base_headers is copied rather than modified, and the Range
        header is set last so that it always describes the requested
        interval, even if base_headers carries a Range entry.
        """
        headers = dict(base_headers)
        headers['Range'] = 'bytes={}-{}'.format(start, end)
        return self._session.get(self._url, headers=headers, stream=True)

    def _merge(self, start, end, left, right):
        # type: (int, int, int, int) -> Iterator[Tuple[int, int]]
        """Return an iterator of intervals to be fetched.

        Once the iterator is exhausted, the overlapping downloaded
        intervals are replaced by a single one covering all of them
        together with the needed interval.

        Args:
            start (int): Start of needed interval
            end (int): End of needed interval
            left (int): Index of first overlapping downloaded data
            right (int): Index after last overlapping downloaded data
        """
        lslice, rslice = self._left[left:right], self._right[left:right]
        # Widen the needed interval to include the overlapping ones.
        i = start = min([start]+lslice[:1])
        end = max([end]+rslice[-1:])
        for j, k in zip(lslice, rslice):
            if j > i:
                # Gap before the downloaded interval [j, k]
                yield i, j-1
            i = k + 1
        if i <= end:
            yield i, end
        self._left[left:right], self._right[left:right] = [start], [end]

    def _download(self, start, end):
        # type: (int, int) -> None
        """Download bytes from start to end inclusively.

        Only the parts which have not been fetched before are
        requested.  The stream position is left unchanged.
        """
        with self._stay():
            left = bisect_left(self._right, start)
            right = bisect_right(self._left, end)
            for lo, hi in self._merge(start, end, left, right):
                response = self._stream_response(lo, hi)
                response.raise_for_status()
                self.seek(lo)
                for chunk in response_chunks(response, self._chunk_size):
                    self._file.write(chunk)

src/pip/_internal/network/utils.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,28 @@
33
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
44

55
if MYPY_CHECK_RUNNING:
6-
from typing import Iterator
6+
from typing import Dict, Iterator
7+
8+
# The following comments and HTTP headers were originally added by
9+
# Donald Stufft in git commit 22c562429a61bb77172039e480873fb239dd8c03.
10+
#
11+
# We use Accept-Encoding: identity here because requests defaults to
12+
# accepting compressed responses. This breaks in a variety of ways
13+
# depending on how the server is configured.
14+
# - Some servers will notice that the file isn't a compressible file
15+
# and will leave the file alone and with an empty Content-Encoding
16+
# - Some servers will notice that the file is already compressed and
17+
# will leave the file alone, adding a Content-Encoding: gzip header
18+
# - Some servers won't notice anything at all and will take a file
19+
# that's already been compressed and compress it again, and set
20+
# the Content-Encoding: gzip header
21+
# By setting this to request only the identity encoding we're hoping
22+
# to eliminate the third case. Hopefully there does not exist a server
23+
# which when given a file will notice it is already compressed and that
24+
# you're not asking for a compressed file and will then decompress it
25+
# before sending because if that's the case I don't think it'll ever be
26+
# possible to make this work.
27+
HEADERS = {'Accept-Encoding': 'identity'} # type: Dict[str, str]
728

829

930
def response_chunks(response, chunk_size=CONTENT_CHUNK_SIZE):

tests/lib/requests_mocks.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def __init__(self, contents):
2828
self.status_code = 200
2929
self.connection = None
3030
self.url = None
31-
self.headers = {}
31+
self.headers = {'Content-Length': len(contents)}
3232
self.history = []
3333

3434
def raise_for_status(self):

tests/unit/test_network_lazy_wheel.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from zipfile import BadZipfile
2+
3+
from pip._vendor.pkg_resources import Requirement
4+
from pytest import fixture, mark, raises
5+
6+
from pip._internal.network.lazy_wheel import dist_from_wheel_url
7+
from pip._internal.network.session import PipSession
8+
from tests.lib.requests_mocks import MockResponse
9+
10+
MYPY_0_782_WHL = (
11+
'https://files.pythonhosted.org/packages/9d/65/'
12+
'b96e844150ce18b9892b155b780248955ded13a2581d31872e7daa90a503/'
13+
'mypy-0.782-py3-none-any.whl'
14+
)
15+
MYPY_0_782_REQS = {
16+
Requirement('typed-ast (<1.5.0,>=1.4.0)'),
17+
Requirement('typing-extensions (>=3.7.4)'),
18+
Requirement('mypy-extensions (<0.5.0,>=0.4.3)'),
19+
Requirement('psutil (>=4.0); extra == "dmypy"'),
20+
}
21+
22+
23+
@fixture
def session():
    """Provide a new PipSession to the test requesting it."""
    pip_session = PipSession()
    return pip_session
26+
27+
28+
@mark.network
def test_dist_from_wheel_url(session):
    """Test if the acquired distribution contains correct information."""
    distribution = dist_from_wheel_url('mypy', MYPY_0_782_WHL, session)
    assert distribution.key == 'mypy'
    assert distribution.version == '0.782'
    assert distribution.extras == ['dmypy']
    # Requirements are collected with every extra enabled.
    requirements = set(distribution.requires(distribution.extras))
    assert requirements == MYPY_0_782_REQS
36+
37+
38+
@mark.network
def test_dist_from_wheel_url_no_range(session, monkeypatch):
    """Test handling when HTTP range requests are not supported."""

    def head_without_ranges(*args, **kwargs):
        # The mocked response carries no Accept-Ranges header.
        return MockResponse(b'')

    monkeypatch.setattr(session, 'head', head_without_ranges)
    with raises(RuntimeError):
        dist_from_wheel_url('mypy', MYPY_0_782_WHL, session)
44+
45+
46+
@mark.network
def test_dist_from_wheel_url_not_zip(session):
    """Test handling when the given URL does not point to a ZIP."""
    not_a_wheel_url = 'https://www.python.org/'
    with raises(BadZipfile):
        dist_from_wheel_url('python', not_a_wheel_url, session)

0 commit comments

Comments
 (0)