Commit fedde5f

Do not cleanup archive download tempdir immediately (#7479)
* Do not cleanup download tempdir immediately

The previous logic forced us to handle populating the download directory in this function, right next to the download and hash checking. By extending the lifetime of the directory, we can more easily separate the code.

This also allows for additional optimizations later: by using metadata from wheels directly instead of unpacking them, we can avoid extracting wheels unnecessarily. Unpacked files can easily be 3x larger than the archives themselves, so this should reduce disk utilization and general IO significantly.
1 parent b9bdad2 commit fedde5f
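The commit message mentions a follow-up optimization: reading metadata directly from wheels instead of unpacking them. A minimal sketch of that idea using only the standard library (the function name `read_wheel_metadata` is illustrative, not pip's actual API):

```python
import zipfile


def read_wheel_metadata(wheel_path):
    # A wheel is a zip archive; its core metadata lives at
    # <name>-<version>.dist-info/METADATA inside the archive.
    # Reading it through zipfile avoids extracting the (often much
    # larger) unpacked tree to disk at all.
    with zipfile.ZipFile(wheel_path) as zf:
        for name in zf.namelist():
            if name.endswith(".dist-info/METADATA"):
                return zf.read(name).decode("utf-8")
    raise ValueError("no METADATA found in {}".format(wheel_path))
```

Since only the single metadata member is decompressed, the disk and IO cost is proportional to the metadata file, not to the whole unpacked wheel.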

File tree

1 file changed: +23 -23 lines changed
src/pip/_internal/operations/prepare.py

```diff
@@ -144,32 +144,32 @@ def unpack_http_url(
     hashes=None,  # type: Optional[Hashes]
 ):
     # type: (...) -> None
-    with TempDirectory(kind="unpack") as temp_dir:
-        # If a download dir is specified, is the file already downloaded there?
-        already_downloaded_path = None
-        if download_dir:
-            already_downloaded_path = _check_download_dir(
-                link, download_dir, hashes
-            )
+    temp_dir = TempDirectory(kind="unpack", globally_managed=True)
+    # If a download dir is specified, is the file already downloaded there?
+    already_downloaded_path = None
+    if download_dir:
+        already_downloaded_path = _check_download_dir(
+            link, download_dir, hashes
+        )
 
-        if already_downloaded_path:
-            from_path = already_downloaded_path
-            content_type = mimetypes.guess_type(from_path)[0]
-        else:
-            # let's download to a tmp dir
-            from_path, content_type = _download_http_url(
-                link, downloader, temp_dir.path, hashes
-            )
+    if already_downloaded_path:
+        from_path = already_downloaded_path
+        content_type = mimetypes.guess_type(from_path)[0]
+    else:
+        # let's download to a tmp dir
+        from_path, content_type = _download_http_url(
+            link, downloader, temp_dir.path, hashes
+        )
 
-        # unpack the archive to the build dir location. even when only
-        # downloading archives, they have to be unpacked to parse dependencies
-        unpack_file(from_path, location, content_type)
+    # unpack the archive to the build dir location. even when only
+    # downloading archives, they have to be unpacked to parse dependencies
+    unpack_file(from_path, location, content_type)
 
-        # a download dir is specified; let's copy the archive there
-        if download_dir and not os.path.exists(
-            os.path.join(download_dir, link.filename)
-        ):
-            _copy_file(from_path, download_dir, link)
+    # a download dir is specified; let's copy the archive there
+    if download_dir and not os.path.exists(
+        os.path.join(download_dir, link.filename)
+    ):
+        _copy_file(from_path, download_dir, link)
 
 
 def _copy2_ignoring_special_files(src, dest):
```
