Parallelize bytecode compilation ✨ #13247

Open · wants to merge 1 commit into base: main
4 changes: 4 additions & 0 deletions news/13247.feature.rst
@@ -0,0 +1,4 @@
Bytecode compilation is parallelized to significantly speed up installation of
large/many packages. By default, the number of workers matches the available CPUs
(up to a hard-coded limit), but can be adjusted using the ``--install-jobs``
option. To disable parallelization, pass ``--install-jobs 1``.
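
As a rough illustration of the default described above, resolving the worker count might look like the following sketch; the helper name and the cap value are assumptions for illustration, not pip's actual internals.

import os
from typing import Union

MAX_AUTO_WORKERS = 8  # hypothetical hard-coded limit, not pip's real value


def resolve_install_jobs(setting: Union[int, str]) -> int:
    """Turn an --install-jobs value ("auto" or a positive int) into a worker count."""
    if setting == "auto":
        return min(os.cpu_count() or 1, MAX_AUTO_WORKERS)
    return int(setting)


print(resolve_install_jobs("auto"))  # capped CPU count, e.g. 8
print(resolve_install_jobs(1))       # 1 disables parallelization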
33 changes: 33 additions & 0 deletions src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
@@ -1070,6 +1070,39 @@ def check_list_path_option(options: Values) -> None:
)


def _handle_jobs(
    option: Option, opt_str: str, value: str, parser: OptionParser
) -> None:
    if value == "auto":
        setattr(parser.values, option.dest, "auto")
        return

    try:
        if (count := int(value)) > 0:
            setattr(parser.values, option.dest, count)
            return
    except ValueError:
        pass

    msg = "should be a positive integer or 'auto'"
    raise_option_error(parser, option=option, msg=msg)


install_jobs: Callable[..., Option] = partial(
    Option,
    "--install-jobs",
    dest="install_jobs",
    default="auto",
    type=str,
    action="callback",
    callback=_handle_jobs,
    help=(
        "Maximum number of workers to use while installing packages. "
        "To disable parallelization, pass 1. (default: %default)"
    ),
)


##########
# groups #
##########
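A self-contained usage sketch of the callback-based validation above; it uses a bare optparse parser and OptionValueError instead of pip's raise_option_error helper, so it is illustrative only and does not import pip.

from optparse import OptionParser, OptionValueError


def handle_jobs(option, opt_str, value, parser):
    # Mirrors _handle_jobs: accept "auto" or any positive integer.
    if value == "auto":
        setattr(parser.values, option.dest, "auto")
        return
    try:
        if (count := int(value)) > 0:
            setattr(parser.values, option.dest, count)
            return
    except ValueError:
        pass
    raise OptionValueError(f"{opt_str}: should be a positive integer or 'auto'")


parser = OptionParser()
parser.add_option(
    "--install-jobs", dest="install_jobs", default="auto",
    type=str, action="callback", callback=handle_jobs,
)

print(parser.parse_args(["--install-jobs", "4"])[0].install_jobs)     # 4
print(parser.parse_args(["--install-jobs", "auto"])[0].install_jobs)  # auto
# Passing "--install-jobs 0" or "--install-jobs x" exits with the error message above.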
7 changes: 7 additions & 0 deletions src/pip/_internal/commands/install.py
@@ -270,6 +270,8 @@ def add_options(self) -> None:
            ),
        )

        self.cmd_opts.add_option(cmdoptions.install_jobs())

    @with_cleanup
    def run(self, options: Values, args: List[str]) -> int:
        if options.use_user_site and options.target_dir is not None:
@@ -416,6 +418,10 @@ def run(self, options: Values, args: List[str]) -> int:
        # we're not modifying it.
        modifying_pip = pip_req.satisfied_by is None
        protect_pip_from_modification_on_windows(modifying_pip=modifying_pip)
        if modifying_pip:
            # Parallelization will re-import pip when starting new workers
            # during installation, which is unsafe if pip is being modified.
            options.install_jobs = 1

Comment on lines +421 to 425
@ichard26 (Member, Author) commented on Feb 27, 2025:

Since the concurrent executors may spin up new workers midway through package installation, a newly installed (and incompatible) pip may be imported, which could result in a crash (as seen in CI). This is a shame, as parallelization does improve self-install time by ~200ms locally, but alas, pip should not crash even if we're trying to install a 10-year-old pip.

Technically, we could avoid this by patching __main__ to point to a module that's completely outside of the pip namespace (it has to be outside because it's the import of pip.__init__ that's blowing up the tests), but I'm strongly opposed to adding a new top-level module such as __pip_empty.py to our distributions.

@ichard26 (Member, Author):

To avoid the global impact, we could special-case pip and always use a serial compiler for it, but parallelization is generally unsafe when modifying pip, so this isn't a good idea either.

@ichard26 (Member, Author):

Oh wait, this is even more problematic than I initially thought, because the workers could import a new pip that was maliciously overwritten. We had a similar security report for the self-version check. Hmm, I'll need to go back to the drawing board. Hopefully there is a way to avoid the extra imports on worker startup; otherwise, I'm going to have to use multiprocessing.Pool directly and pay the worker startup overhead all at once... :(
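
For context on the re-import concern above, here is a minimal standalone sketch (not pip code): with the "spawn" start method, every new worker re-executes the parent module's top-level code before running any task, which is why importing a half-replaced or attacker-controlled pip from a worker is the worry.

import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor

# Top-level code like this runs again in every spawned worker process.
print(f"module imported in process {os.getpid()}")


def square(x: int) -> int:
    return x * x


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    with ProcessPoolExecutor(max_workers=2, mp_context=ctx) as pool:
        # The import line above is printed once per worker, then the results follow.
        print(list(pool.map(square, range(4))))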

        reqs_to_build = [
            r
@@ -465,6 +471,7 @@ def run(self, options: Values, args: List[str]) -> int:
            use_user_site=options.use_user_site,
            pycompile=options.compile,
            progress_bar=options.progress_bar,
            workers=options.install_jobs,
        )

        lib_locations = get_lib_location_guesses(
38 changes: 12 additions & 26 deletions src/pip/_internal/operations/install/wheel.py
@@ -1,19 +1,15 @@
"""Support for installing and building the "wheel" binary package format."""

import collections
import compileall
import contextlib
import csv
import importlib
import logging
import os.path
import re
import shutil
import sys
import warnings
from base64 import urlsafe_b64encode
from email.message import Message
from io import StringIO
from itertools import chain, filterfalse, starmap
from typing import (
    IO,
@@ -51,6 +47,7 @@
from pip._internal.models.scheme import SCHEME_KEYS, Scheme
from pip._internal.utils.filesystem import adjacent_tmp_file, replace
from pip._internal.utils.misc import ensure_dir, hash_file, partition
from pip._internal.utils.pyc_compile import BytecodeCompiler
from pip._internal.utils.unpacking import (
    current_umask,
    is_within_directory,
@@ -417,12 +414,12 @@ def make(
        return super().make(specification, options)


def _install_wheel(  # noqa: C901, PLR0915 function is too long
def _install_wheel(  # noqa: C901 function is too long
    name: str,
    wheel_zip: ZipFile,
    wheel_path: str,
    scheme: Scheme,
    pycompile: bool = True,
    pycompiler: Optional[BytecodeCompiler],
    warn_script_location: bool = True,
    direct_url: Optional[DirectUrl] = None,
    requested: bool = False,
@@ -601,25 +598,14 @@ def pyc_source_file_paths() -> Generator[str, None, None]:
                continue
            yield full_installed_path

    def pyc_output_path(path: str) -> str:
        """Return the path the pyc file would have been written to."""
        return importlib.util.cache_from_source(path)

    # Compile all of the pyc files for the installed files
    if pycompile:
        with contextlib.redirect_stdout(StringIO()) as stdout:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                for path in pyc_source_file_paths():
                    success = compileall.compile_file(path, force=True, quiet=True)
                    if success:
                        pyc_path = pyc_output_path(path)
                        assert os.path.exists(pyc_path)
@ichard26 (Member, Author):

I've removed this assert as part of this PR, as I don't think we've ever received a bug report about the .pyc file not existing. If it were missing, that would suggest a stdlib bug. I'd be happy to add it back if desired, though.

                        pyc_record_path = cast(
                            "RecordPath", pyc_path.replace(os.path.sep, "/")
                        )
                        record_installed(pyc_record_path, pyc_path)
        logger.debug(stdout.getvalue())
    if pycompiler is not None:
        for module in pycompiler(pyc_source_file_paths()):
            if module.is_success:
                pyc_record_path = module.pyc_path.replace(os.path.sep, "/")
                record_installed(RecordPath(pyc_record_path), module.pyc_path)
            if output := module.compile_output:
                logger.debug(output)

    maker = PipScriptMaker(None, scheme.scripts)

@@ -718,7 +704,7 @@ def install_wheel(
    wheel_path: str,
    scheme: Scheme,
    req_description: str,
    pycompile: bool = True,
    pycompiler: Optional[BytecodeCompiler] = None,
    warn_script_location: bool = True,
    direct_url: Optional[DirectUrl] = None,
    requested: bool = False,
@@ -730,7 +716,7 @@
                wheel_zip=z,
                wheel_path=wheel_path,
                scheme=scheme,
                pycompile=pycompile,
                pycompiler=pycompiler,
                warn_script_location=warn_script_location,
                direct_url=direct_url,
                requested=requested,
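The pip._internal.utils.pyc_compile module itself is not shown in this excerpt, so the following is only a rough reconstruction of the interface the call sites above appear to rely on: a compiler that is callable with an iterable of source paths, yields per-module results with is_success, pyc_path, and compile_output, and doubles as a context manager so a parallel variant can shut its worker pool down. A serial in-process version might look like this.

import compileall
import importlib.util
from contextlib import redirect_stdout
from dataclasses import dataclass
from io import StringIO
from typing import Iterable, Iterator


@dataclass
class CompileResult:
    py_path: str
    pyc_path: str
    is_success: bool
    compile_output: str


class SerialCompiler:
    """Compile sources one at a time in-process (the workers == 1 case)."""

    def __call__(self, paths: Iterable[str]) -> Iterator[CompileResult]:
        for path in paths:
            output = StringIO()
            with redirect_stdout(output):
                success = compileall.compile_file(path, force=True, quiet=True)
            yield CompileResult(
                py_path=path,
                pyc_path=importlib.util.cache_from_source(path),
                is_success=bool(success),
                compile_output=output.getvalue(),
            )

    # The call sites use the compiler as a context manager; a parallel
    # implementation would tear down its worker pool in __exit__.
    def __enter__(self) -> "SerialCompiler":
        return self

    def __exit__(self, *exc: object) -> None:
        return None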
41 changes: 38 additions & 3 deletions src/pip/_internal/req/__init__.py
@@ -1,10 +1,14 @@
import collections
import logging
from contextlib import nullcontext
from dataclasses import dataclass
from typing import Generator, List, Optional, Sequence, Tuple
from functools import partial
from typing import Generator, Iterable, List, Optional, Sequence, Tuple
from zipfile import ZipFile

from pip._internal.cli.progress_bars import get_install_progress_renderer
from pip._internal.utils.logging import indent_log
from pip._internal.utils.pyc_compile import WorkerSetting, create_bytecode_compiler

from .req_file import parse_requirements
from .req_install import InstallRequirement
@@ -33,6 +37,28 @@ def _validate_requirements(
        yield req.name, req


def _does_python_size_surpass_threshold(
    requirements: Iterable[InstallRequirement], threshold: int
) -> bool:
Comment on lines +40 to +42
@ichard26 (Member, Author):

This receives and checks against the threshold instead of simply returning the total code size so that no more zip files are inspected than necessary (to minimize the small but real time cost).

"""Inspect wheels to check whether there is enough .py code to
enable bytecode parallelization.
"""
py_size = 0
for req in requirements:
if not req.local_file_path or not req.is_wheel:
# No wheel to inspect as this is a legacy editable.
continue

with ZipFile(req.local_file_path, allowZip64=True) as wheel_file:
for entry in wheel_file.infolist():
if entry.filename.endswith(".py"):
py_size += entry.file_size
if py_size > threshold:
return True

return False


def install_given_reqs(
    requirements: List[InstallRequirement],
    global_options: Sequence[str],
@@ -43,6 +69,7 @@ def install_given_reqs(
    use_user_site: bool,
    pycompile: bool,
    progress_bar: str,
    workers: WorkerSetting,
) -> List[InstallationResult]:
    """
    Install everything in the given list.
@@ -68,7 +95,15 @@
        )
        items = renderer(items)

    with indent_log():
    if pycompile:
        code_size_check = partial(
            _does_python_size_surpass_threshold, to_install.values()
        )
        pycompiler = create_bytecode_compiler(workers, code_size_check)
    else:
        pycompiler = None

    with indent_log(), pycompiler or nullcontext():
        for requirement in items:
            req_name = requirement.name
            assert req_name is not None
@@ -87,7 +122,7 @@
                    prefix=prefix,
                    warn_script_location=warn_script_location,
                    use_user_site=use_user_site,
                    pycompile=pycompile,
                    pycompiler=pycompiler,
                )
            except Exception:
                # if install did not succeed, rollback previous uninstall
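To make the wiring above concrete: create_bytecode_compiler() lives in pip._internal.utils.pyc_compile and is not part of this diff, so the sketch below is hypothetical (the threshold constant and function name are assumptions). Its point is that the wheel inspection only happens once parallelism is actually a possibility, and the bound partial(_does_python_size_surpass_threshold, to_install.values()) lets the helper stop early as soon as the threshold is crossed.

from typing import Callable, Optional

CODE_SIZE_THRESHOLD = 1_000_000  # assumed value: roughly 1 MB of installed .py source


def should_parallelize(
    worker_count: int, code_size_check: Optional[Callable[[int], bool]]
) -> bool:
    """Decide whether spinning up a parallel compiler is worth it."""
    if worker_count <= 1:
        return False
    if code_size_check is None:
        return True
    # Wheels are only opened here, i.e. when more than one worker could be
    # used, and the callback returns as soon as the threshold is surpassed.
    return code_size_check(CODE_SIZE_THRESHOLD)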
5 changes: 3 additions & 2 deletions src/pip/_internal/req/req_install.py
@@ -53,6 +53,7 @@
    redact_auth_from_url,
)
from pip._internal.utils.packaging import get_requirement
from pip._internal.utils.pyc_compile import BytecodeCompiler
from pip._internal.utils.subprocess import runner_with_spinner_message
from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
from pip._internal.utils.unpacking import unpack_file
@@ -812,7 +813,7 @@ def install(
        prefix: Optional[str] = None,
        warn_script_location: bool = True,
        use_user_site: bool = False,
        pycompile: bool = True,
        pycompiler: Optional[BytecodeCompiler] = None,
    ) -> None:
        assert self.req is not None
        scheme = get_scheme(
@@ -869,7 +870,7 @@ def install(
                self.local_file_path,
                scheme=scheme,
                req_description=str(self.req),
                pycompile=pycompile,
                pycompiler=pycompiler,
                warn_script_location=warn_script_location,
                direct_url=self.download_info if self.is_direct else None,
                requested=self.user_supplied,