Draft lazy zip over HTTP

McSinyx · McSinyx · commit 3eb85a061937 · 2020-06-26T15:15:28.000+07:00
diff --git a/news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial b/news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial
diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py
@@ -11,7 +11,7 @@
 from pip._internal.cli.progress_bars import DownloadProgressProvider
 from pip._internal.models.index import PyPI
 from pip._internal.network.cache import is_from_cache
-from pip._internal.network.utils import response_chunks
+from pip._internal.network.utils import HEADERS, response_chunks
 from pip._internal.utils.misc import (
     format_size,
     redact_auth_from_url,
@@ -132,30 +132,7 @@ def _get_http_response_filename(resp, link):
 def _http_get_download(session, link):
     # type: (PipSession, Link) -> Response
     target_url = link.url.split('#', 1)[0]
-    resp = session.get(
-        target_url,
-        # We use Accept-Encoding: identity here because requests
-        # defaults to accepting compressed responses. This breaks in
-        # a variety of ways depending on how the server is configured.
-        # - Some servers will notice that the file isn't a compressible
-        #   file and will leave the file alone and with an empty
-        #   Content-Encoding
-        # - Some servers will notice that the file is already
-        #   compressed and will leave the file alone and will add a
-        #   Content-Encoding: gzip header
-        # - Some servers won't notice anything at all and will take
-        #   a file that's already been compressed and compress it again
-        #   and set the Content-Encoding: gzip header
-        # By setting this to request only the identity encoding We're
-        # hoping to eliminate the third case. Hopefully there does not
-        # exist a server which when given a file will notice it is
-        # already compressed and that you're not asking for a
-        # compressed file and will then decompress it before sending
-        # because if that's the case I don't think it'll ever be
-        # possible to make this work.
-        headers={"Accept-Encoding": "identity"},
-        stream=True,
-    )
+    resp = session.get(target_url, headers=HEADERS, stream=True)
     resp.raise_for_status()
     return resp
 
diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py
@@ -0,0 +1,204 @@
+"""Lazy ZIP over HTTP"""
+
+__all__ = ['LazyZip']
+
+from bisect import bisect_left, bisect_right
+from contextlib import contextmanager
+from tempfile import NamedTemporaryFile
+from zipfile import BadZipfile, ZipFile
+
+from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
+from pip._vendor.six.moves import range
+
+from pip._internal.network.utils import HEADERS, response_chunks
+from pip._internal.utils.typing import MYPY_CHECK_RUNNING
+
+if MYPY_CHECK_RUNNING:
+    from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+    from pip._vendor.requests.models import Response
+
+    from pip._internal.network.session import PipSession
+
+
+class LazyZip:
+    """File-like object mapped to a ZIP file over HTTP.
+
+    This uses HTTP range requests to lazily fetch the file's content,
+    which is supposed to be fed to ZipFile.
+    """
+
+    def __init__(self, session, url, chunk_size=CONTENT_CHUNK_SIZE):
+        # type: (PipSession, str, int) -> None
+        head = session.head(url, headers=HEADERS)
+        head.raise_for_status()
+        assert head.status_code == 200
+        self._session, self._url, self._chunk_size = session, url, chunk_size
+        self._length = int(head.headers['Content-Length'])
+        self._file = NamedTemporaryFile()
+        self.truncate(self._length)
+        self._left = []  # type: List[int]
+        self._right = []  # type: List[int]
+        self._check_zip('bytes' in head.headers.get('Accept-Ranges', 'none'))
+
+    @property
+    def mode(self):
+        # type: () -> str
+        """Opening mode, which is always rb."""
+        return 'rb'
+
+    @property
+    def name(self):
+        # type: () -> str
+        """File name."""
+        return self._file.name
+
+    def seekable(self):
+        # type: () -> bool
+        """Return whether random access is supported, which is True."""
+        return True
+
+    def close(self):
+        # type: () -> None
+        """Close the file."""
+        self._file.close()
+
+    @property
+    def closed(self):
+        # type: () -> bool
+        """Whether the file is closed."""
+        return self._file.closed
+
+    def read(self, size=-1):
+        # type: (int) -> bytes
+        """Read up to size bytes from the object and return them.
+
+        As a convenience, if size is unspecified or -1,
+        all bytes until EOF are returned.  Fewer than
+        size bytes may be returned if EOF is reached.
+        """
+        start, length = self.tell(), self._length
+        stop = start + size if 0 <= size <= length-start else length
+        self._download(start, stop-1)
+        return self._file.read(size)
+
+    def readable(self):
+        # type: () -> bool
+        """Return whether the file is readable, which is True."""
+        return True
+
+    def seek(self, offset, whence=0):
+        # type: (int, int) -> int
+        """Change stream position and return the new absolute position.
+
+        Seek to offset relative to the position indicated by whence:
+        * 0: Start of stream (the default).  offset should be >= 0;
+        * 1: Current position - offset may be negative;
+        * 2: End of stream - offset is usually negative.
+        """
+        return self._file.seek(offset, whence)
+
+    def tell(self):
+        # type: () -> int
+        """Return the current position."""
+        return self._file.tell()
+
+    def truncate(self, size=None):
+        # type: (Optional[int]) -> int
+        """Resize the stream to the given size in bytes.
+
+        If size is unspecified, resize to the current position.
+        The current stream position isn't changed.
+
+        Return the new file size.
+        """
+        return self._file.truncate(size)
+
+    def writable(self):
+        # type: () -> bool
+        """Return False."""
+        return False
+
+    def __enter__(self):
+        # type: () -> LazyZip
+        self._file.__enter__()
+        return self
+
+    def __exit__(self, *exc):
+        # type: (*Any) -> Optional[bool]
+        return self._file.__exit__(*exc)
+
+    @contextmanager
+    def _stay(self):
+        # type: ()-> Iterator[None]
+        """Return a context manager keeping the position.
+
+        At the end of the block, seek back to original position.
+        """
+        pos = self.tell()
+        try:
+            yield
+        finally:
+            self.seek(pos)
+
+    def _check_zip(self, range_request):
+        # type: (bool) -> None
+        """Check and download until the file is a valid ZIP."""
+        end = self._length - 1
+        if not range_request:
+            self._download(0, end)
+            return
+        for start in reversed(range(0, end, self._chunk_size)):
+            self._download(start, end)
+            with self._stay():
+                try:
+                    # For read-only ZIP files, ZipFile only needs
+                    # methods read, seek, seekable and tell.
+                    # The best way to type-hint in this case is to use
+                    # Python 3.8+ typing.Protocol.
+                    ZipFile(self)  # type: ignore
+                except BadZipfile:
+                    pass
+                else:
+                    break
+
+    def _stream_response(self, start, end, base_headers=HEADERS):
+        # type: (int, int, Dict[str, str]) -> Response
+        """Return HTTP response to a range request from start to end."""
+        headers = {'Range': 'bytes={}-{}'.format(start, end)}
+        headers.update(base_headers)
+        return self._session.get(self._url, headers=headers, stream=True)
+
+    def _merge(self, start, end, left, right):
+        # type: (int, int, int, int) -> Iterator[Tuple[int, int]]
+        """Return an iterator of intervals to be fetched.
+
+        Args:
+            start (int): Start of needed interval
+            end (int): End of needed interval
+            left (int): Index of first overlapping downloaded data
+            right (int): Index after last overlapping downloaded data
+        """
+        lslice, rslice = self._left[left:right], self._right[left:right]
+        i = start = min([start]+lslice[:1])
+        end = max([end]+rslice[-1:])
+        for j, k in zip(lslice, rslice):
+            if j > i:
+                yield i, j-1
+            i = k + 1
+        if i <= end:
+            yield i, end
+        self._left[left:right], self._right[left:right] = [start], [end]
+
+    def _download(self, start, end):
+        # type: (int, int) -> None
+        """Download bytes from start to end inclusively."""
+        with self._stay():
+            left = bisect_left(self._right, start)
+            right = bisect_right(self._left, end)
+            for start, end in self._merge(start, end, left, right):
+                response = self._stream_response(start, end)
+                response.raise_for_status()
+                self.seek(start)
+                for chunk in response_chunks(response, self._chunk_size):
+                    self._file.write(chunk)
diff --git a/src/pip/_internal/network/utils.py b/src/pip/_internal/network/utils.py
@@ -3,7 +3,28 @@
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 
 if MYPY_CHECK_RUNNING:
-    from typing import Iterator
+    from typing import Dict, Iterator
+
+# The following comments and HTTP headers were originally added by
+# Donald Stufft in git commit 22c562429a61bb77172039e480873fb239dd8c03.
+#
+# We use Accept-Encoding: identity here because requests defaults to
+# accepting compressed responses. This breaks in a variety of ways
+# depending on how the server is configured.
+# - Some servers will notice that the file isn't a compressible file
+#   and will leave the file alone and with an empty Content-Encoding
+# - Some servers will notice that the file is already compressed and
+#   will leave the file alone, adding a Content-Encoding: gzip header
+# - Some servers won't notice anything at all and will take a file
+#   that's already been compressed and compress it again, and set
+#   the Content-Encoding: gzip header
+# By setting this to request only the identity encoding we're hoping
+# to eliminate the third case.  Hopefully there does not exist a server
+# which when given a file will notice it is already compressed and that
+# you're not asking for a compressed file and will then decompress it
+# before sending because if that's the case I don't think it'll ever be
+# possible to make this work.
+HEADERS = {'Accept-Encoding': 'identity'}  # type: Dict[str, str]
 
 
 def response_chunks(response, chunk_size=CONTENT_CHUNK_SIZE):