Skip to content

Commit 3eb85a0

Browse files
committed
Draft lazy zip over HTTP
1 parent 0b5ad47 commit 3eb85a0

File tree

4 files changed

+228
-26
lines changed

4 files changed

+228
-26
lines changed

news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial

Whitespace-only changes.

src/pip/_internal/network/download.py

+2-25
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pip._internal.cli.progress_bars import DownloadProgressProvider
1212
from pip._internal.models.index import PyPI
1313
from pip._internal.network.cache import is_from_cache
14-
from pip._internal.network.utils import response_chunks
14+
from pip._internal.network.utils import HEADERS, response_chunks
1515
from pip._internal.utils.misc import (
1616
format_size,
1717
redact_auth_from_url,
@@ -132,30 +132,7 @@ def _get_http_response_filename(resp, link):
132132
def _http_get_download(session, link):
133133
# type: (PipSession, Link) -> Response
134134
target_url = link.url.split('#', 1)[0]
135-
resp = session.get(
136-
target_url,
137-
# We use Accept-Encoding: identity here because requests
138-
# defaults to accepting compressed responses. This breaks in
139-
# a variety of ways depending on how the server is configured.
140-
# - Some servers will notice that the file isn't a compressible
141-
# file and will leave the file alone and with an empty
142-
# Content-Encoding
143-
# - Some servers will notice that the file is already
144-
# compressed and will leave the file alone and will add a
145-
# Content-Encoding: gzip header
146-
# - Some servers won't notice anything at all and will take
147-
# a file that's already been compressed and compress it again
148-
# and set the Content-Encoding: gzip header
149-
# By setting this to request only the identity encoding We're
150-
# hoping to eliminate the third case. Hopefully there does not
151-
# exist a server which when given a file will notice it is
152-
# already compressed and that you're not asking for a
153-
# compressed file and will then decompress it before sending
154-
# because if that's the case I don't think it'll ever be
155-
# possible to make this work.
156-
headers={"Accept-Encoding": "identity"},
157-
stream=True,
158-
)
135+
resp = session.get(target_url, headers=HEADERS, stream=True)
159136
resp.raise_for_status()
160137
return resp
161138

+204
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""Lazy ZIP over HTTP"""
2+
3+
# Names exported by a star-import of this module; LazyZip is the only one.
__all__ = ['LazyZip']
4+
5+
from bisect import bisect_left, bisect_right
6+
from contextlib import contextmanager
7+
from tempfile import NamedTemporaryFile
8+
from zipfile import BadZipfile, ZipFile
9+
10+
from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
11+
from pip._vendor.six.moves import range
12+
13+
from pip._internal.network.utils import HEADERS, response_chunks
14+
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
15+
16+
if MYPY_CHECK_RUNNING:
17+
from typing import Any, Dict, Iterator, List, Optional, Tuple
18+
19+
from pip._vendor.requests.models import Response
20+
21+
from pip._internal.network.session import PipSession
22+
23+
24+
class LazyZip(object):
    """File-like object mapped to a ZIP file over HTTP.

    This uses HTTP range requests to lazily fetch the file's content,
    which is supposed to be fed to ZipFile.

    The remote bytes are mirrored into a local temporary file that is
    resized to the remote length.  ``_left`` and ``_right`` are parallel
    lists, kept sorted for bisect, holding the inclusive start and end
    offsets of the byte ranges that have already been fetched.
    """
    # Inherits from object explicitly so that the class is new-style
    # (and its properties well-behaved) on Python 2 as well.

    def __init__(self, session, url, chunk_size=CONTENT_CHUNK_SIZE):
        # type: (PipSession, str, int) -> None
        """Probe the remote file and fetch enough of it to open as ZIP.

        Args:
            session (PipSession): Session used for every HTTP request
            url (str): Location of the remote ZIP file
            chunk_size (int): Granularity, in bytes, of the initial
                fetching and of the writes to the temporary file

        Raises whatever ``raise_for_status`` raises for a failed HEAD
        or range request, and KeyError if the HEAD response carries
        no Content-Length header.
        """
        head = session.head(url, headers=HEADERS)
        head.raise_for_status()
        # raise_for_status only rejects 4xx/5xx responses; anything
        # other than a plain 200 is still unexpected here.
        assert head.status_code == 200
        self._session, self._url, self._chunk_size = session, url, chunk_size
        self._length = int(head.headers['Content-Length'])
        self._file = NamedTemporaryFile()
        try:
            self.truncate(self._length)
            self._left = []  # type: List[int]
            self._right = []  # type: List[int]
            self._check_zip(
                'bytes' in head.headers.get('Accept-Ranges', 'none'),
            )
        except BaseException:
            # Do not leave the temporary file open (and on disk)
            # when the initial fetching fails or is interrupted.
            self._file.close()
            raise

    @property
    def mode(self):
        # type: () -> str
        """Opening mode, which is always rb."""
        return 'rb'

    @property
    def name(self):
        # type: () -> str
        """File name (that of the backing temporary file)."""
        return self._file.name

    def seekable(self):
        # type: () -> bool
        """Return whether random access is supported, which is True."""
        return True

    def close(self):
        # type: () -> None
        """Close the file."""
        self._file.close()

    @property
    def closed(self):
        # type: () -> bool
        """Whether the file is closed."""
        return self._file.closed

    def read(self, size=-1):
        # type: (int) -> bytes
        """Read up to size bytes from the object and return them.

        As a convenience, if size is unspecified, None or negative,
        all bytes until EOF are returned.  Fewer than
        size bytes may be returned if EOF is reached.

        The bytes about to be read are downloaded first
        if they have not been fetched yet.
        """
        start, length = self.tell(), self._length
        if size is None or size < 0 or size > length - start:
            stop = length
        else:
            stop = start + size
        # An empty range (size 0, or position at or past EOF) needs
        # no download; requesting it would only record an inverted
        # interval in the bookkeeping lists.
        if stop > start:
            self._download(start, stop-1)
        return self._file.read(-1 if size is None else size)

    def readable(self):
        # type: () -> bool
        """Return whether the file is readable, which is True."""
        return True

    def seek(self, offset, whence=0):
        # type: (int, int) -> int
        """Change stream position and return the new absolute position.

        Seek to offset relative position indicated by whence:
        * 0: Start of stream (the default).  offset should be >= 0;
        * 1: Current position - offset may be negative;
        * 2: End of stream - offset usually negative.
        """
        return self._file.seek(offset, whence)

    def tell(self):
        # type: () -> int
        """Return the current position."""
        return self._file.tell()

    def truncate(self, size=None):
        # type: (Optional[int]) -> int
        """Resize the stream to the given size in bytes.

        If size is unspecified resize to the current position.
        The current stream position isn't changed.

        Return the new file size.
        """
        return self._file.truncate(size)

    def writable(self):
        # type: () -> bool
        """Return False."""
        return False

    def __enter__(self):
        # type: () -> LazyZip
        """Enter the context of the backing file and return self."""
        self._file.__enter__()
        return self

    def __exit__(self, *exc):
        # type: (*Any) -> Optional[bool]
        """Leave the context of the backing file."""
        return self._file.__exit__(*exc)

    @contextmanager
    def _stay(self):
        # type: ()-> Iterator[None]
        """Return a context manager keeping the position.

        At the end of the block, seek back to original position.
        """
        pos = self.tell()
        try:
            yield
        finally:
            self.seek(pos)

    def _check_zip(self, range_request):
        # type: (bool) -> None
        """Check and download until the file is a valid ZIP.

        Args:
            range_request (bool): Whether the server advertised
                support for byte range requests
        """
        end = self._length - 1
        if not range_request:
            # Partial fetching is unavailable: get everything at once.
            self._download(0, end)
            return
        # Grow the fetched tail backwards one chunk at a time, stopping
        # as soon as ZipFile accepts what is available locally.
        for start in reversed(range(0, end, self._chunk_size)):
            self._download(start, end)
            with self._stay():
                try:
                    # For read-only ZIP files, ZipFile only needs
                    # methods read, seek, seekable and tell.
                    # The best way to type-hint in this case is to use
                    # Python 3.8+ typing.Protocol.
                    ZipFile(self)  # type: ignore
                except BadZipfile:
                    pass
                else:
                    break

    def _stream_response(self, start, end, base_headers=HEADERS):
        # type: (int, int, Dict[str, str]) -> Response
        """Return HTTP response to a range request from start to end.

        The Range header is set after copying base_headers, so it
        cannot be overridden by them and base_headers is not mutated.
        """
        headers = dict(base_headers)
        headers['Range'] = 'bytes={}-{}'.format(start, end)
        return self._session.get(self._url, headers=headers, stream=True)

    def _merge(self, start, end, left, right):
        # type: (int, int, int, int) -> Iterator[Tuple[int, int]]
        """Return an iterator of intervals to be fetched.

        Once the iterator is exhausted, the overlapping downloaded
        intervals are replaced by the single merged one; nothing is
        recorded if iteration stops early.

        Args:
            start (int): Start of needed interval
            end (int): End of needed interval
            left (int): Index of first overlapping downloaded data
            right (int): Index after last overlapping downloaded data
        """
        lslice, rslice = self._left[left:right], self._right[left:right]
        # Widen the needed interval to cover every overlapping one.
        i = start = min([start]+lslice[:1])
        end = max([end]+rslice[-1:])
        for j, k in zip(lslice, rslice):
            if j > i:
                # Gap before the downloaded interval [j, k].
                yield i, j-1
            i = k + 1
        if i <= end:
            # Gap after the last downloaded interval.
            yield i, end
        self._left[left:right], self._right[left:right] = [start], [end]

    def _download(self, start, end):
        # type: (int, int) -> None
        """Download bytes from start to end inclusively.

        Only the parts that have not been fetched yet are requested.
        The stream position is restored afterwards.
        """
        with self._stay():
            left = bisect_left(self._right, start)
            right = bisect_right(self._left, end)
            for gap_start, gap_end in self._merge(start, end, left, right):
                response = self._stream_response(gap_start, gap_end)
                response.raise_for_status()
                self.seek(gap_start)
                for chunk in response_chunks(response, self._chunk_size):
                    self._file.write(chunk)

src/pip/_internal/network/utils.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,28 @@
33
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
44

55
if MYPY_CHECK_RUNNING:
6-
from typing import Iterator
6+
from typing import Dict, Iterator
7+
8+
# The following comments and HTTP headers were originally added by
9+
# Donald Stufft in git commit 22c562429a61bb77172039e480873fb239dd8c03.
10+
#
11+
# We use Accept-Encoding: identity here because requests defaults to
12+
# accepting compressed responses. This breaks in a variety of ways
13+
# depending on how the server is configured.
14+
# - Some servers will notice that the file isn't a compressible file
15+
# and will leave the file alone and with an empty Content-Encoding
16+
# - Some servers will notice that the file is already compressed and
17+
# will leave the file alone, adding a Content-Encoding: gzip header
18+
# - Some servers won't notice anything at all and will take a file
19+
# that's already been compressed and compress it again, and set
20+
# the Content-Encoding: gzip header
21+
# By setting this to request only the identity encoding we're hoping
22+
# to eliminate the third case. Hopefully there does not exist a server
23+
# which when given a file will notice it is already compressed and that
24+
# you're not asking for a compressed file and will then decompress it
25+
# before sending because if that's the case I don't think it'll ever be
26+
# possible to make this work.
27+
HEADERS = {'Accept-Encoding': 'identity'} # type: Dict[str, str]
728

829

930
def response_chunks(response, chunk_size=CONTENT_CHUNK_SIZE):

0 commit comments

Comments
 (0)