diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 1823f2d7502..0661897e944 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -30,7 +30,7 @@ from pip._vendor.requests.models import Request, Response from pip._internal.metadata import BaseDistribution - from pip._internal.models.link import Link + from pip._internal.network.download import _FileDownload from pip._internal.req.req_install import InstallRequirement logger = logging.getLogger(__name__) @@ -819,17 +819,19 @@ class IncompleteDownloadError(DiagnosticPipError): reference = "incomplete-download" - def __init__( - self, link: Link, received: int, expected: int, *, retries: int - ) -> None: + def __init__(self, download: _FileDownload) -> None: # Dodge circular import. from pip._internal.utils.misc import format_size - download_status = f"{format_size(received)}/{format_size(expected)}" - if retries: - retry_status = f"after {retries} attempts " + assert download.size is not None + download_status = ( + f"{format_size(download.bytes_received)}/{format_size(download.size)}" + ) + if download.reattempts: + retry_status = f"after {download.reattempts + 1} attempts " hint = "Use --resume-retries to configure resume attempt limit." else: + # Download retrying is not enabled. retry_status = "" hint = "Consider using --resume-retries to enable download resumption." message = Text( @@ -839,7 +841,7 @@ def __init__( super().__init__( message=message, - context=f"URL: {link.redacted_url}", + context=f"URL: {download.link.redacted_url}", hint_stmt=hint, note_stmt="This is an issue with network connectivity, not pip.", ) diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 40c36bf9fe7..1a00a5f246e 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -6,7 +6,8 @@ import logging import mimetypes import os -from collections.abc import Iterable +from collections.abc import Iterable, Mapping +from dataclasses import dataclass from http import HTTPStatus from typing import BinaryIO @@ -40,7 +41,7 @@ def _get_http_response_etag_or_last_modified(resp: Response) -> str | None: return resp.headers.get("etag", resp.headers.get("last-modified")) -def _prepare_download( +def _log_download( resp: Response, link: Link, progress_bar: str, @@ -134,28 +135,28 @@ def _get_http_response_filename(resp: Response, link: Link) -> str: return filename -def _http_get_download( - session: PipSession, - link: Link, - range_start: int | None = 0, - if_range: str | None = None, -) -> Response: - target_url = link.url.split("#", 1)[0] - headers = HEADERS.copy() - # request a partial download - if range_start: - headers["Range"] = f"bytes={range_start}-" - # make sure the file hasn't changed - if if_range: - headers["If-Range"] = if_range - try: - resp = session.get(target_url, headers=headers, stream=True) - raise_for_status(resp) - except NetworkConnectionError as e: - assert e.response is not None - logger.critical("HTTP error %s while getting %s", e.response.status_code, link) - raise - return resp +@dataclass +class _FileDownload: + """Stores the state of a single link download.""" + + link: Link + output_file: BinaryIO + size: int | None + bytes_received: int = 0 + reattempts: int = 0 + + def is_incomplete(self) -> bool: + return bool(self.size is not None and self.bytes_received < self.size) + + def write_chunk(self, data: bytes) -> None: + self.bytes_received += len(data) + self.output_file.write(data) + + def reset_file(self) -> None: + """Delete any saved data and reset progress to zero.""" + self.output_file.seek(0) + self.output_file.truncate() + self.bytes_received = 0 class Downloader: @@ -172,146 +173,106 @@ def __init__( self._progress_bar = progress_bar self._resume_retries = resume_retries - def __call__(self, link: Link, location: str) -> tuple[str, str]: - """Download the file given by link into location.""" - resp = _http_get_download(self._session, link) - # NOTE: The original download size needs to be passed down everywhere - # so if the download is resumed (with a HTTP Range request) the progress - # bar will report the right size. - total_length = _get_http_response_size(resp) - content_type = resp.headers.get("Content-Type", "") + def batch( + self, links: Iterable[Link], location: str + ) -> Iterable[tuple[Link, tuple[str, str]]]: + """Convenience method to download multiple links.""" + for link in links: + filepath, content_type = self(link, location) + yield link, (filepath, content_type) - filename = _get_http_response_filename(resp, link) - filepath = os.path.join(location, filename) + def __call__(self, link: Link, location: str) -> tuple[str, str]: + """Download a link and save it under location.""" + resp = self._http_get(link) + download_size = _get_http_response_size(resp) + filepath = os.path.join(location, _get_http_response_filename(resp, link)) with open(filepath, "wb") as content_file: - bytes_received = self._process_response( - resp, link, content_file, 0, total_length - ) - # If possible, check for an incomplete download and attempt resuming. - if total_length and bytes_received < total_length: - self._attempt_resume( - resp, link, content_file, total_length, bytes_received - ) + download = _FileDownload(link, content_file, download_size) + self._process_response(download, resp) + if download.is_incomplete(): + self._attempt_resumes_or_redownloads(download, resp) + content_type = resp.headers.get("Content-Type", "") return filepath, content_type - def _process_response( - self, - resp: Response, - link: Link, - content_file: BinaryIO, - bytes_received: int, - total_length: int | None, - ) -> int: - """Process the response and write the chunks to the file.""" - chunks = _prepare_download( - resp, link, self._progress_bar, total_length, range_start=bytes_received - ) - return self._write_chunks_to_file( - chunks, content_file, allow_partial=bool(total_length) + def _process_response(self, download: _FileDownload, resp: Response) -> None: + """Download and save chunks from a response.""" + chunks = _log_download( + resp, + download.link, + self._progress_bar, + download.size, + range_start=download.bytes_received, ) - - def _write_chunks_to_file( - self, chunks: Iterable[bytes], content_file: BinaryIO, *, allow_partial: bool - ) -> int: - """Write the chunks to the file and return the number of bytes received.""" - bytes_received = 0 try: for chunk in chunks: - bytes_received += len(chunk) - content_file.write(chunk) + download.write_chunk(chunk) except ReadTimeoutError as e: - # If partial downloads are OK (the download will be retried), don't bail. - if not allow_partial: + # If the download size is not known, then give up downloading the file. + if download.size is None: raise e - # Ensuring bytes_received is returned to attempt resume logger.warning("Connection timed out while downloading.") - return bytes_received - - def _attempt_resume( - self, - resp: Response, - link: Link, - content_file: BinaryIO, - total_length: int | None, - bytes_received: int, + def _attempt_resumes_or_redownloads( + self, download: _FileDownload, first_resp: Response ) -> None: - """Attempt to resume the download if connection was dropped.""" - etag_or_last_modified = _get_http_response_etag_or_last_modified(resp) - - attempts_left = self._resume_retries - while total_length and attempts_left and bytes_received < total_length: - attempts_left -= 1 + """Attempt to resume/restart the download if connection was dropped.""" + while download.reattempts < self._resume_retries and download.is_incomplete(): + assert download.size is not None + download.reattempts += 1 logger.warning( "Attempting to resume incomplete download (%s/%s, attempt %d)", - format_size(bytes_received), - format_size(total_length), - (self._resume_retries - attempts_left), + format_size(download.bytes_received), + format_size(download.size), + download.reattempts, ) try: - # Try to resume the download using a HTTP range request. - resume_resp = _http_get_download( - self._session, - link, - range_start=bytes_received, - if_range=etag_or_last_modified, - ) - + resume_resp = self._http_get_resume(download, should_match=first_resp) # Fallback: if the server responded with 200 (i.e., the file has # since been modified or range requests are unsupported) or any # other unexpected status, restart the download from the beginning. must_restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT if must_restart: - bytes_received, total_length, etag_or_last_modified = ( - self._reset_download_state(resume_resp, content_file) - ) + download.reset_file() + download.size = _get_http_response_size(resume_resp) + first_resp = resume_resp - bytes_received += self._process_response( - resume_resp, link, content_file, bytes_received, total_length - ) + self._process_response(download, resume_resp) except (ConnectionError, ReadTimeoutError, OSError): continue # No more resume attempts. Raise an error if the download is still incomplete. - if total_length and bytes_received < total_length: - os.remove(content_file.name) - raise IncompleteDownloadError( - link, bytes_received, total_length, retries=self._resume_retries + if download.is_incomplete(): + os.remove(download.output_file.name) + raise IncompleteDownloadError(download) + + def _http_get_resume( + self, download: _FileDownload, should_match: Response + ) -> Response: + """Issue a HTTP range request to resume the download.""" + # To better understand the download resumption logic, see the mdn web docs: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/Range_requests + headers = HEADERS.copy() + headers["Range"] = f"bytes={download.bytes_received}-" + # If possible, use a conditional range request to avoid corrupted + # downloads caused by the remote file changing in-between. + if identifier := _get_http_response_etag_or_last_modified(should_match): + headers["If-Range"] = identifier + return self._http_get(download.link, headers) + + def _http_get(self, link: Link, headers: Mapping[str, str] = HEADERS) -> Response: + target_url = link.url_without_fragment + try: + resp = self._session.get(target_url, headers=headers, stream=True) + raise_for_status(resp) + except NetworkConnectionError as e: + assert e.response is not None + logger.critical( + "HTTP error %s while getting %s", e.response.status_code, link ) - - def _reset_download_state( - self, - resp: Response, - content_file: BinaryIO, - ) -> tuple[int, int | None, str | None]: - """Reset the download state to restart downloading from the beginning.""" - content_file.seek(0) - content_file.truncate() - bytes_received = 0 - total_length = _get_http_response_size(resp) - etag_or_last_modified = _get_http_response_etag_or_last_modified(resp) - - return bytes_received, total_length, etag_or_last_modified - - -class BatchDownloader: - def __init__( - self, - session: PipSession, - progress_bar: str, - resume_retries: int, - ) -> None: - self._downloader = Downloader(session, progress_bar, resume_retries) - - def __call__( - self, links: Iterable[Link], location: str - ) -> Iterable[tuple[Link, tuple[str, str]]]: - """Download the files given by links into location.""" - for link in links: - filepath, content_type = self._downloader(link, location) - yield link, (filepath, content_type) + raise + return resp diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index ddbc975048e..908331eba29 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -29,7 +29,7 @@ from pip._internal.models.direct_url import ArchiveInfo from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel -from pip._internal.network.download import BatchDownloader, Downloader +from pip._internal.network.download import Downloader from pip._internal.network.lazy_wheel import ( HTTPRangeRequestUnsupported, dist_from_wheel_url, @@ -245,7 +245,6 @@ def __init__( self.build_tracker = build_tracker self._session = session self._download = Downloader(session, progress_bar, resume_retries) - self._batch_download = BatchDownloader(session, progress_bar, resume_retries) self.finder = finder # Where still-packed archives should be written to. If None, they are @@ -468,10 +467,7 @@ def _complete_partial_requirements( assert req.link links_to_fully_download[req.link] = req - batch_download = self._batch_download( - links_to_fully_download.keys(), - temp_dir, - ) + batch_download = self._download.batch(links_to_fully_download.keys(), temp_dir) for link, (filepath, _) in batch_download: logger.debug("Downloading link %s to %s", link, filepath) req = links_to_fully_download[link] diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py index 6dfadf9e00f..1aa8c9267fa 100644 --- a/tests/unit/test_network_download.py +++ b/tests/unit/test_network_download.py @@ -12,8 +12,7 @@ from pip._internal.network.download import ( Downloader, _get_http_response_size, - _http_get_download, - _prepare_download, + _log_download, parse_content_disposition, sanitize_content_filename, ) @@ -77,7 +76,7 @@ ), ], ) -def test_prepare_download__log( +def test_log_download( caplog: pytest.LogCaptureFixture, url: str, headers: dict[str, str], @@ -93,7 +92,7 @@ def test_prepare_download__log( resp.from_cache = from_cache link = Link(url) total_length = _get_http_response_size(resp) - _prepare_download( + _log_download( resp, link, progress_bar="on", @@ -149,29 +148,6 @@ def test_sanitize_content_filename__platform_dependent( assert sanitize_content_filename(filename) == expected -@pytest.mark.parametrize( - "range_start, if_range, expected_headers", - [ - (None, None, HEADERS), - (1234, None, {**HEADERS, "Range": "bytes=1234-"}), - (1234, '"etag"', {**HEADERS, "Range": "bytes=1234-", "If-Range": '"etag"'}), - ], -) -def test_http_get_download( - range_start: int | None, - if_range: str | None, - expected_headers: dict[str, str], -) -> None: - session = PipSession() - session.get = MagicMock() - link = Link("http://example.com/foo.tgz") - with patch("pip._internal.network.download.raise_for_status"): - _http_get_download(session, link, range_start, if_range) - session.get.assert_called_once_with( - "http://example.com/foo.tgz", headers=expected_headers, stream=True - ) - - @pytest.mark.parametrize( "content_disposition, default_filename, expected", [ @@ -253,9 +229,9 @@ def test_parse_content_disposition( [(24, None), (36, None)], b"new-0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", ), - # The downloader should fail after n resume_retries attempts. + # The downloader should fail after N resume_retries attempts. # This prevents the downloader from getting stuck if the connection - # is unstable and the server doesn't not support range requests. + # is unstable and the server does NOT support range requests. ( 1, [ @@ -265,7 +241,7 @@ def test_parse_content_disposition( [(24, None)], None, ), - # The downloader should use If-Range header to make the range + # The downloader should use the If-Range header to make the range # request conditional if it is possible to check for modifications # (e.g. if we know the creation time of the initial response). ( @@ -279,17 +255,28 @@ def test_parse_content_disposition( 200, b"0cfa7e9d-1868-4dd7-9fb3-", ), + ( + { + "content-length": "42", + "last-modified": "Wed, 21 Oct 2015 07:30:00 GMT", + }, + 200, + b"new-0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", + ), ( { "content-length": "12", "last-modified": "Wed, 21 Oct 2015 07:54:00 GMT", }, - 206, + 200, b"f2561d5dfd89", ), ], - [(24, "Wed, 21 Oct 2015 07:28:00 GMT")], - b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", + [ + (24, "Wed, 21 Oct 2015 07:28:00 GMT"), + (40, "Wed, 21 Oct 2015 07:30:00 GMT"), + ], + b"f2561d5dfd89", ), # ETag is preferred over Last-Modified for the If-Range condition. ( @@ -310,12 +297,12 @@ def test_parse_content_disposition( "last-modified": "Wed, 21 Oct 2015 07:54:00 GMT", "etag": '"33a64df551425fcc55e4d42a148795d9f25f89d4"', }, - 206, + 200, b"f2561d5dfd89", ), ], [(24, '"33a64df551425fcc55e4d42a148795d9f25f89d4"')], - b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", + b"f2561d5dfd89", ), ], ) @@ -323,7 +310,7 @@ def test_downloader( resume_retries: int, mock_responses: list[tuple[dict[str, str], int, bytes]], # list of (range_start, if_range) - expected_resume_args: list[tuple[int | None, int | None]], + expected_resume_args: list[tuple[int | None, str | None]], # expected_bytes is None means the download should fail expected_bytes: bytes | None, tmpdir: Path, @@ -338,9 +325,9 @@ def test_downloader( resp.headers = headers resp.status_code = status_code responses.append(resp) - _http_get_download = MagicMock(side_effect=responses) + _http_get_mock = MagicMock(side_effect=responses) - with patch("pip._internal.network.download._http_get_download", _http_get_download): + with patch.object(Downloader, "_http_get", _http_get_mock): if expected_bytes is None: remove = MagicMock(return_value=None) with patch("os.remove", remove): @@ -354,9 +341,12 @@ def test_downloader( downloaded_bytes = downloaded_file.read() assert downloaded_bytes == expected_bytes - calls = [call(session, link)] # the initial request + calls = [call(link)] # the initial GET request for range_start, if_range in expected_resume_args: - calls.append(call(session, link, range_start=range_start, if_range=if_range)) + headers = {**HEADERS, "Range": f"bytes={range_start}-"} + if if_range: + headers["If-Range"] = if_range + calls.append(call(link, headers)) - # Make sure that the download makes additional requests for resumption - _http_get_download.assert_has_calls(calls) + # Make sure that the downloader makes additional requests for resumption + _http_get_mock.assert_has_calls(calls)