Skip to content

Commit ee08992

Browse files
authored
fix(vendor)!: direct extraction for registry sources (#15514)
### What does this PR try to resolve? `PathSource::list_files` has some heurstic rules for listing files. Those rules are mainly designed for `cargo package`. Previously, cargo-vendor relies on those rules to understand what files to vendor. However, it shouldn't use those rules because: * Package extracted from a `.crate` tarball isn't Git-controlled, some rules may apply differently. * The extracted package already went through `PathSource::list_files` during packaging. It should be clean enough. * Should keep crate sources from registry sources in a pristine state, which is exactly what vendoring is meant for. Instead, we switch to direct extraction into the vendor directory to ensure source code is the same as in the `.crate` tarball. ### How should we test and review this PR? There is already a test `vendor::package_exclude` for the fix of #9054, though now I think it is not a good fix. The test change shows the correct behavior change. I am not sure if we want more tests. Also, there are some caveats with this fix: * The overwrite protection in `unpack_package` assumes the unpack directory is always `<pkg>-<version`>. We don't want to remove this, but for cargo-vendor supports vendoring without version suffix. For that case, we need a temporary staging area, and move the unpacked source then. * The heurstic in `PathSource::list_files` did something "good" in general cases, like excluding hidden directories. That means common directories like `.github` or `.config` won't be vendored. After this, those get included. This is another round of churns. We might want to get other `cargo-vendor` changes along with this in one single release. ### Additional information * Fixes #9054 * Fixes #9555 * Fixes #9575 * Fixes #11000 * Fixes #14034 * Fixes #15080 * Fixes #15090 This also opens a door for * #10310 * #13191 Since we are changing vendored source (again), we might want to remove the `.rej`/`.orig` special rules in cargo-vendor, as well as look into the new source-dedup vendor dir layout. <!-- TRIAGEBOT_START --> <!-- TRIAGEBOT_SUMMARY_START --> ### Summary Notes - [benchmark-result](#15514 (comment)) by [weihanglo](https://github.com/weihanglo) Generated by triagebot, see [help](https://forge.rust-lang.org/triagebot/note.html) for how to add more <!-- TRIAGEBOT_SUMMARY_DATA_START$${"entries_by_url":{"https://github.com/rust-lang/cargo/pull/15514#issuecomment-2870275766":{"title":"benchmark-result","comment_url":"https://github.com/rust-lang/cargo/pull/15514#issuecomment-2870275766","author":"weihanglo"}}}$$TRIAGEBOT_SUMMARY_DATA_END --> <!-- TRIAGEBOT_SUMMARY_END --> <!-- TRIAGEBOT_END -->
2 parents 2d662d9 + c2d3bc4 commit ee08992

File tree

3 files changed

+241
-158
lines changed

3 files changed

+241
-158
lines changed

src/cargo/ops/vendor.rs

Lines changed: 141 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,18 @@ use crate::core::SourceId;
33
use crate::core::{GitReference, Package, Workspace};
44
use crate::ops;
55
use crate::sources::path::PathSource;
6-
use crate::sources::PathEntry;
6+
use crate::sources::RegistrySource;
77
use crate::sources::SourceConfigMap;
88
use crate::sources::CRATES_IO_REGISTRY;
99
use crate::util::cache_lock::CacheLockMode;
1010
use crate::util::{try_canonicalize, CargoResult, GlobalContext};
11+
1112
use anyhow::{bail, Context as _};
1213
use cargo_util::{paths, Sha256};
14+
use cargo_util_schemas::core::SourceKind;
1315
use serde::Serialize;
16+
use walkdir::WalkDir;
17+
1418
use std::collections::HashSet;
1519
use std::collections::{BTreeMap, BTreeSet, HashMap};
1620
use std::ffi::OsStr;
@@ -35,7 +39,7 @@ pub fn vendor(ws: &Workspace<'_>, opts: &VendorOptions<'_>) -> CargoResult<()> {
3539
extra_workspaces.push(ws);
3640
}
3741
let workspaces = extra_workspaces.iter().chain(Some(ws)).collect::<Vec<_>>();
38-
let _lock = gctx.acquire_package_cache_lock(CacheLockMode::MutateExclusive)?;
42+
let _lock = gctx.acquire_package_cache_lock(CacheLockMode::DownloadExclusive)?;
3943
let vendor_config = sync(gctx, &workspaces, opts).context("failed to sync")?;
4044

4145
if gctx.shell().verbosity() != Verbosity::Quiet {
@@ -86,9 +90,16 @@ struct SourceReplacementCache<'gctx> {
8690
}
8791

8892
impl SourceReplacementCache<'_> {
89-
fn new(gctx: &GlobalContext) -> CargoResult<SourceReplacementCache<'_>> {
93+
fn new(
94+
gctx: &GlobalContext,
95+
respect_source_config: bool,
96+
) -> CargoResult<SourceReplacementCache<'_>> {
9097
Ok(SourceReplacementCache {
91-
map: SourceConfigMap::new(gctx)?,
98+
map: if respect_source_config {
99+
SourceConfigMap::new(gctx)
100+
} else {
101+
SourceConfigMap::empty(gctx)
102+
}?,
92103
cache: Default::default(),
93104
})
94105
}
@@ -111,14 +122,14 @@ fn sync(
111122
opts: &VendorOptions<'_>,
112123
) -> CargoResult<VendorConfig> {
113124
let dry_run = false;
114-
let canonical_destination = try_canonicalize(opts.destination);
115-
let canonical_destination = canonical_destination.as_deref().unwrap_or(opts.destination);
116-
let dest_dir_already_exists = canonical_destination.exists();
125+
let vendor_dir = try_canonicalize(opts.destination);
126+
let vendor_dir = vendor_dir.as_deref().unwrap_or(opts.destination);
127+
let vendor_dir_already_exists = vendor_dir.exists();
117128

118-
paths::create_dir_all(&canonical_destination)?;
129+
paths::create_dir_all(&vendor_dir)?;
119130
let mut to_remove = HashSet::new();
120131
if !opts.no_delete {
121-
for entry in canonical_destination.read_dir()? {
132+
for entry in vendor_dir.read_dir()? {
122133
let entry = entry?;
123134
if !entry
124135
.file_name()
@@ -130,19 +141,13 @@ fn sync(
130141
}
131142
}
132143

133-
let mut source_replacement_cache = SourceReplacementCache::new(gctx)?;
134-
135-
// First up attempt to work around rust-lang/cargo#5956. Apparently build
136-
// artifacts sprout up in Cargo's global cache for whatever reason, although
137-
// it's unsure what tool is causing these issues at this time. For now we
138-
// apply a heavy-hammer approach which is to delete Cargo's unpacked version
139-
// of each crate to start off with. After we do this we'll re-resolve and
140-
// redownload again, which should trigger Cargo to re-extract all the
141-
// crates.
142-
//
143-
// Note that errors are largely ignored here as this is a best-effort
144-
// attempt. If anything fails here we basically just move on to the next
145-
// crate to work with.
144+
let mut source_replacement_cache =
145+
SourceReplacementCache::new(gctx, opts.respect_source_config)?;
146+
147+
let mut checksums = HashMap::new();
148+
let mut ids = BTreeMap::new();
149+
150+
// Let's download all crates and start storing internal tables about them.
146151
for ws in workspaces {
147152
let (packages, resolve) = ops::resolve_ws(ws, dry_run)
148153
.with_context(|| format!("failed to load lockfile for {}", ws.root().display()))?;
@@ -152,54 +157,19 @@ fn sync(
152157
.with_context(|| format!("failed to download packages for {}", ws.root().display()))?;
153158

154159
for pkg in resolve.iter() {
155-
let sid = if opts.respect_source_config {
156-
source_replacement_cache.get(pkg.source_id())?
157-
} else {
158-
pkg.source_id()
159-
};
160+
let sid = source_replacement_cache.get(pkg.source_id())?;
160161

161-
// Don't delete actual source code!
162+
// Don't vendor path crates since they're already in the repository
162163
if sid.is_path() {
164+
// And don't delete actual source code!
163165
if let Ok(path) = sid.url().to_file_path() {
164166
if let Ok(path) = try_canonicalize(path) {
165167
to_remove.remove(&path);
166168
}
167169
}
168170
continue;
169171
}
170-
if sid.is_git() {
171-
continue;
172-
}
173172

174-
// Only delete sources that are safe to delete, i.e. they are caches.
175-
if sid.is_registry() {
176-
if let Ok(pkg) = packages.get_one(pkg) {
177-
drop(fs::remove_dir_all(pkg.root()));
178-
}
179-
continue;
180-
}
181-
}
182-
}
183-
184-
let mut checksums = HashMap::new();
185-
let mut ids = BTreeMap::new();
186-
187-
// Next up let's actually download all crates and start storing internal
188-
// tables about them.
189-
for ws in workspaces {
190-
let (packages, resolve) = ops::resolve_ws(ws, dry_run)
191-
.with_context(|| format!("failed to load lockfile for {}", ws.root().display()))?;
192-
193-
packages
194-
.get_many(resolve.iter())
195-
.with_context(|| format!("failed to download packages for {}", ws.root().display()))?;
196-
197-
for pkg in resolve.iter() {
198-
// No need to vendor path crates since they're already in the
199-
// repository
200-
if pkg.source_id().is_path() {
201-
continue;
202-
}
203173
ids.insert(
204174
pkg,
205175
packages
@@ -247,7 +217,7 @@ fn sync(
247217
};
248218

249219
sources.insert(id.source_id());
250-
let dst = canonical_destination.join(&dst_name);
220+
let dst = vendor_dir.join(&dst_name);
251221
to_remove.remove(&dst);
252222
let cksum = dst.join(".cargo-checksum.json");
253223
// Registries are the only immutable sources,
@@ -263,16 +233,89 @@ fn sync(
263233
)?;
264234

265235
let _ = fs::remove_dir_all(&dst);
266-
let pathsource = PathSource::new(src, id.source_id(), gctx);
267-
let paths = pathsource.list_files(pkg)?;
268-
let mut map = BTreeMap::new();
269-
cp_sources(pkg, src, &paths, &dst, &mut map, &mut tmp_buf, gctx)
270-
.with_context(|| format!("failed to copy over vendored sources for: {}", id))?;
236+
237+
let mut file_cksums = BTreeMap::new();
238+
239+
// Need this mapping anyway because we will directly consult registry sources,
240+
// otherwise builtin source replacement (sparse registry) won't be respected.
241+
let sid = source_replacement_cache.get(id.source_id())?;
242+
243+
if sid.is_registry() {
244+
// To keep the unpacked source from registry in a pristine state,
245+
// we'll do a direct extraction into the vendor directory.
246+
let registry = match sid.kind() {
247+
SourceKind::Registry | SourceKind::SparseRegistry => {
248+
RegistrySource::remote(sid, &Default::default(), gctx)?
249+
}
250+
SourceKind::LocalRegistry => {
251+
let path = sid.url().to_file_path().expect("local path");
252+
RegistrySource::local(sid, &path, &Default::default(), gctx)
253+
}
254+
_ => unreachable!("not registry source: {sid}"),
255+
};
256+
257+
let walkdir = |root| {
258+
WalkDir::new(root)
259+
.into_iter()
260+
// It is safe to skip errors,
261+
// since we'll hit them during copying/reading later anyway.
262+
.filter_map(|e| e.ok())
263+
// There should be no symlink in tarballs on crates.io,
264+
// but might be wrong for local registries.
265+
// Hence here be conservative and include symlinks.
266+
.filter(|e| e.file_type().is_file() || e.file_type().is_symlink())
267+
};
268+
let mut compute_file_cksums = |root| {
269+
for e in walkdir(root) {
270+
let path = e.path();
271+
let relative = path.strip_prefix(&dst).unwrap();
272+
let cksum = Sha256::new()
273+
.update_path(path)
274+
.map(Sha256::finish_hex)
275+
.with_context(|| format!("failed to checksum `{}`", path.display()))?;
276+
file_cksums.insert(relative.to_str().unwrap().replace("\\", "/"), cksum);
277+
}
278+
Ok::<_, anyhow::Error>(())
279+
};
280+
if dir_has_version_suffix {
281+
registry.unpack_package_in(id, &vendor_dir, &vendor_this)?;
282+
compute_file_cksums(&dst)?;
283+
} else {
284+
// Due to the extra sanity check in registry unpack
285+
// (ensure it contain only one top-level directory with name `pkg-version`),
286+
// we can only unpack a directory with version suffix,
287+
// and move it to the no suffix directory.
288+
let staging_dir = tempfile::Builder::new()
289+
.prefix(".vendor-staging")
290+
.tempdir_in(vendor_dir)?;
291+
let unpacked_src =
292+
registry.unpack_package_in(id, staging_dir.path(), &vendor_this)?;
293+
if let Err(e) = fs::rename(&unpacked_src, &dst) {
294+
// This fallback is mainly for Windows 10 versions earlier than 1607.
295+
// The destination of `fs::rename` can't be a diretory in older versions.
296+
// Can be removed once the minimal supported Windows version gets bumped.
297+
tracing::warn!("failed to `mv {unpacked_src:?} {dst:?}`: {e}");
298+
let paths: Vec<_> = walkdir(&unpacked_src).map(|e| e.into_path()).collect();
299+
cp_sources(pkg, src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx)
300+
.with_context(|| format!("failed to copy vendored sources for {id}"))?;
301+
} else {
302+
compute_file_cksums(&dst)?;
303+
}
304+
}
305+
} else {
306+
let paths = PathSource::new(src, sid, gctx)
307+
.list_files(pkg)?
308+
.into_iter()
309+
.map(|p| p.into_path_buf())
310+
.collect::<Vec<_>>();
311+
cp_sources(pkg, src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx)
312+
.with_context(|| format!("failed to copy vendored sources for {id}"))?;
313+
}
271314

272315
// Finally, emit the metadata about this package
273316
let json = serde_json::json!({
274317
"package": checksums.get(id),
275-
"files": map,
318+
"files": file_cksums,
276319
});
277320

278321
paths::write(&cksum, json.to_string())?;
@@ -347,9 +390,9 @@ fn sync(
347390
directory: opts.destination.to_string_lossy().replace("\\", "/"),
348391
},
349392
);
350-
} else if !dest_dir_already_exists {
393+
} else if !vendor_dir_already_exists {
351394
// Nothing to vendor. Remove the destination dir we've just created.
352-
paths::remove_dir(canonical_destination)?;
395+
paths::remove_dir(vendor_dir)?;
353396
}
354397

355398
Ok(VendorConfig { source: config })
@@ -358,36 +401,18 @@ fn sync(
358401
fn cp_sources(
359402
pkg: &Package,
360403
src: &Path,
361-
paths: &[PathEntry],
404+
paths: &[PathBuf],
362405
dst: &Path,
363406
cksums: &mut BTreeMap<String, String>,
364407
tmp_buf: &mut [u8],
365408
gctx: &GlobalContext,
366409
) -> CargoResult<()> {
367410
for p in paths {
368-
let p = p.as_ref();
369411
let relative = p.strip_prefix(&src).unwrap();
370412

371-
match relative.to_str() {
372-
// Skip git config files as they're not relevant to builds most of
373-
// the time and if we respect them (e.g. in git) then it'll
374-
// probably mess with the checksums when a vendor dir is checked
375-
// into someone else's source control
376-
Some(".gitattributes" | ".gitignore" | ".git") => continue,
377-
378-
// Temporary Cargo files
379-
Some(".cargo-ok") => continue,
380-
381-
// Skip patch-style orig/rej files. Published crates on crates.io
382-
// have `Cargo.toml.orig` which we don't want to use here and
383-
// otherwise these are rarely used as part of the build process.
384-
Some(filename) => {
385-
if filename.ends_with(".orig") || filename.ends_with(".rej") {
386-
continue;
387-
}
388-
}
389-
_ => {}
390-
};
413+
if !vendor_this(relative) {
414+
continue;
415+
}
391416

392417
// Join pathname components individually to make sure that the joined
393418
// path uses the correct directory separators everywhere, since
@@ -417,7 +442,7 @@ fn cp_sources(
417442
&dst,
418443
&mut dst_opts,
419444
&mut contents.as_bytes(),
420-
"Generated Cargo.toml",
445+
Path::new("Generated Cargo.toml"),
421446
tmp_buf,
422447
)?
423448
} else {
@@ -430,13 +455,7 @@ fn cp_sources(
430455
.with_context(|| format!("failed to stat {:?}", p))?;
431456
dst_opts.mode(src_metadata.mode());
432457
}
433-
copy_and_checksum(
434-
&dst,
435-
&mut dst_opts,
436-
&mut src,
437-
&p.display().to_string(),
438-
tmp_buf,
439-
)?
458+
copy_and_checksum(&dst, &mut dst_opts, &mut src, &p, tmp_buf)?
440459
};
441460

442461
cksums.insert(relative.to_str().unwrap().replace("\\", "/"), cksum);
@@ -562,7 +581,7 @@ fn copy_and_checksum<T: Read>(
562581
dst_path: &Path,
563582
dst_opts: &mut OpenOptions,
564583
contents: &mut T,
565-
contents_path: &str,
584+
contents_path: &Path,
566585
buf: &mut [u8],
567586
) -> CargoResult<String> {
568587
let mut dst = dst_opts
@@ -584,3 +603,25 @@ fn copy_and_checksum<T: Read>(
584603
.with_context(|| format!("failed to write to {:?}", dst_path))?;
585604
}
586605
}
606+
607+
/// Filters files we want to vendor.
608+
///
609+
/// `relative` is a path relative to the package root.
610+
fn vendor_this(relative: &Path) -> bool {
611+
match relative.to_str() {
612+
// Skip git config files as they're not relevant to builds most of
613+
// the time and if we respect them (e.g. in git) then it'll
614+
// probably mess with the checksums when a vendor dir is checked
615+
// into someone else's source control
616+
Some(".gitattributes" | ".gitignore" | ".git") => false,
617+
618+
// Temporary Cargo files
619+
Some(".cargo-ok") => false,
620+
621+
// Skip patch-style orig/rej files. Published crates on crates.io
622+
// have `Cargo.toml.orig` which we don't want to use here and
623+
// otherwise these are rarely used as part of the build process.
624+
Some(p) if p.ends_with(".orig") || p.ends_with(".rej") => false,
625+
_ => true,
626+
}
627+
}

0 commit comments

Comments
 (0)