Skip to content

Commit 3570ee7

Browse files
committed
Add a nonstandard shallow clone for GitHub
1 parent 0b84a35 commit 3570ee7

File tree

4 files changed

+182
-59
lines changed

4 files changed

+182
-59
lines changed

src/cargo/core/source/source_id.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@ use crate::sources::{DirectorySource, CRATES_IO_DOMAIN, CRATES_IO_INDEX, CRATES_
44
use crate::sources::{GitSource, PathSource, RegistrySource};
55
use crate::util::{CanonicalUrl, CargoResult, Config, IntoUrl};
66
use log::trace;
7-
use serde::de;
8-
use serde::ser;
7+
use serde::{de, ser, Serialize};
98
use std::cmp::{self, Ordering};
109
use std::collections::HashSet;
1110
use std::fmt::{self, Formatter};
@@ -58,7 +57,7 @@ enum SourceKind {
5857
}
5958

6059
/// Information to find a specific commit in a Git repository.
61-
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
60+
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
6261
pub enum GitReference {
6362
/// From a tag.
6463
Tag(String),

src/cargo/sources/git/source.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ impl<'cfg> Source for GitSource<'cfg> {
144144
// database, then try to resolve our reference with the preexisting
145145
// repository.
146146
(None, Some(db)) if self.config.offline() => {
147-
let rev = db.resolve(&self.manifest_reference).with_context(|| {
147+
let rev = db.resolve_to_object(&self.manifest_reference).with_context(|| {
148148
"failed to lookup reference in preexisting repository, and \
149149
can't check for updates in offline mode (--offline)"
150150
})?;

src/cargo/sources/git/utils.rs

+174-40
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,13 @@ use cargo_util::{paths, ProcessBuilder};
99
use curl::easy::List;
1010
use git2::{self, ErrorClass, ObjectType, Oid};
1111
use log::{debug, info};
12-
use serde::ser;
13-
use serde::Serialize;
12+
use serde::{ser, Deserialize, Serialize};
1413
use std::borrow::Cow;
1514
use std::env;
1615
use std::fmt;
1716
use std::path::{Path, PathBuf};
1817
use std::process::Command;
19-
use std::str;
18+
use std::str::{self, FromStr};
2019
use std::time::{Duration, Instant};
2120
use url::Url;
2221

@@ -28,6 +27,17 @@ where
2827
s.collect_str(t)
2928
}
3029

30+
fn deserialize_str<'de, D, T>(deserializer: D) -> Result<T, D::Error>
31+
where
32+
T: FromStr,
33+
<T as FromStr>::Err: fmt::Display,
34+
D: serde::Deserializer<'de>,
35+
{
36+
let buf = String::deserialize(deserializer)?;
37+
38+
FromStr::from_str(&buf).map_err(serde::de::Error::custom)
39+
}
40+
3141
pub struct GitShortID(git2::Buf);
3242

3343
impl GitShortID {
@@ -78,8 +88,25 @@ impl GitRemote {
7888
&self.url
7989
}
8090

81-
pub fn rev_for(&self, path: &Path, reference: &GitReference) -> CargoResult<git2::Oid> {
82-
reference.resolve(&self.db_at(path)?.repo)
91+
/// Finds the Oid associated with the reference. The result is guaranteed to be on disk.
92+
/// But may not be the object the reference points to!
93+
/// For example, the reference points to a Commit and this may return the Tree that commit points to.
94+
pub fn rev_to_object_for(
95+
&self,
96+
path: &Path,
97+
reference: &GitReference,
98+
) -> CargoResult<git2::Oid> {
99+
reference.resolve_to_object(&self.db_at(path)?.repo)
100+
}
101+
102+
/// Finds the Oid of the Commit the reference points to. But the result may not be on disk!
103+
/// For example, the reference points to a Commit and we have only cloned the Tree.
104+
pub fn rev_to_commit_for(
105+
&self,
106+
path: &Path,
107+
reference: &GitReference,
108+
) -> CargoResult<git2::Oid> {
109+
reference.resolve_to_commit(&self.db_at(path)?.repo)
83110
}
84111

85112
pub fn checkout(
@@ -104,7 +131,7 @@ impl GitRemote {
104131
}
105132
}
106133
None => {
107-
if let Ok(rev) = reference.resolve(&db.repo) {
134+
if let Ok(rev) = reference.resolve_to_object(&db.repo) {
108135
return Ok((db, rev));
109136
}
110137
}
@@ -123,7 +150,7 @@ impl GitRemote {
123150
.context(format!("failed to clone into: {}", into.display()))?;
124151
let rev = match locked_rev {
125152
Some(rev) => rev,
126-
None => reference.resolve(&repo)?,
153+
None => reference.resolve_to_object(&repo)?,
127154
};
128155

129156
Ok((
@@ -179,13 +206,65 @@ impl GitDatabase {
179206
self.repo.revparse_single(&oid.to_string()).is_ok()
180207
}
181208

182-
pub fn resolve(&self, r: &GitReference) -> CargoResult<git2::Oid> {
183-
r.resolve(&self.repo)
209+
pub fn resolve_to_object(&self, r: &GitReference) -> CargoResult<git2::Oid> {
210+
r.resolve_to_object(&self.repo)
184211
}
185212
}
186213

214+
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
215+
struct ShallowDataBlob {
216+
#[serde(serialize_with = "serialize_str")]
217+
#[serde(deserialize_with = "deserialize_str")]
218+
tree: git2::Oid,
219+
#[serde(serialize_with = "serialize_str")]
220+
#[serde(deserialize_with = "deserialize_str")]
221+
etag: git2::Oid,
222+
}
223+
224+
#[test]
225+
fn check_with_git_hub() {
226+
panic!(
227+
r#"nonstandard shallow clone may be worse than a full check out.
228+
This test is here to make sure we do not merge until we have official signoff from GitHub"#
229+
)
230+
}
231+
187232
impl GitReference {
188-
pub fn resolve(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
233+
/// Finds the Oid associated with the reference. The result is guaranteed to be on disk.
234+
/// But may not be the object the reference points to!
235+
/// For example, the reference points to a Commit and this may return the Tree that commit points to.
236+
pub fn resolve_to_object(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
237+
// Check if Cargo has done a nonstandard shallow clone
238+
if let Some(shallow_data) = self.find_shallow_blob(repo) {
239+
Ok(shallow_data.tree)
240+
} else {
241+
self.resolve_by_git(repo)
242+
}
243+
}
244+
/// Finds the Oid of the Commit the reference points to. But the result may not be on disk!
245+
/// For example, the reference points to a Commit and we have only cloned the Tree.
246+
pub fn resolve_to_commit(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
247+
// Check if Cargo has done a nonstandard shallow clone
248+
if let Some(shallow_data) = self.find_shallow_blob(repo) {
249+
return Ok(shallow_data.etag);
250+
} else {
251+
self.resolve_by_git(repo)
252+
}
253+
}
254+
255+
fn find_shallow_blob(&self, repo: &git2::Repository) -> Option<ShallowDataBlob> {
256+
repo.find_reference(
257+
&(format!(
258+
"refs/cargo-{}",
259+
serde_json::to_string(self).expect("why cant we make json of this")
260+
)),
261+
)
262+
.ok()
263+
.and_then(|re| re.peel_to_blob().ok())
264+
.and_then(|blob| serde_json::from_slice(blob.content()).ok())
265+
}
266+
267+
fn resolve_by_git(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
189268
let id = match self {
190269
// Note that we resolve the named tag here in sync with where it's
191270
// fetched into via `fetch` below.
@@ -707,10 +786,17 @@ fn reset(repo: &git2::Repository, obj: &git2::Object<'_>, config: &Config) -> Ca
707786
opts.progress(|_, cur, max| {
708787
drop(pb.tick(cur, max, ""));
709788
});
710-
debug!("doing reset");
711-
repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?;
712-
debug!("reset done");
713-
Ok(())
789+
if obj.as_tree().is_some() {
790+
debug!("doing reset for Cargo nonstandard shallow clone");
791+
repo.checkout_tree(obj, Some(&mut opts))?;
792+
debug!("reset done");
793+
Ok(())
794+
} else {
795+
debug!("doing reset");
796+
repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?;
797+
debug!("reset done");
798+
Ok(())
799+
}
714800
}
715801

716802
pub fn with_fetch_options(
@@ -819,32 +905,44 @@ pub fn fetch(
819905
// The `+` symbol on the refspec means to allow a forced (fast-forward)
820906
// update which is needed if there is ever a force push that requires a
821907
// fast-forward.
822-
match reference {
823-
// For branches and tags we can fetch simply one reference and copy it
824-
// locally, no need to fetch other branches/tags.
825-
GitReference::Branch(b) => {
826-
refspecs.push(format!("+refs/heads/{0}:refs/remotes/origin/{0}", b));
827-
}
828-
GitReference::Tag(t) => {
829-
refspecs.push(format!("+refs/tags/{0}:refs/remotes/origin/tags/{0}", t));
908+
if let Some(oid_to_fetch) = oid_to_fetch {
909+
// GitHub told us exactly the min needed to fetch. So we can go ahead and do a Cargo nonstandard shallow clone.
910+
refspecs.push(format!("+{0}", oid_to_fetch));
911+
} else {
912+
// In some cases we have Cargo nonstandard shallow cloned this repo before, but cannot do it now.
913+
// Mostly if GitHub is now rate limiting us. If so, remove the info about the shallow clone.
914+
if let Ok(mut refe) = repo.find_reference(&format!(
915+
"refs/cargo-{}",
916+
serde_json::to_string(reference).expect("why cant we make json of this")
917+
)) {
918+
let _ = refe.delete();
830919
}
831920

832-
GitReference::DefaultBranch => {
833-
refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
834-
}
921+
match reference {
922+
// For branches and tags we can fetch simply one reference and copy it
923+
// locally, no need to fetch other branches/tags.
924+
GitReference::Branch(b) => {
925+
refspecs.push(format!("+refs/heads/{0}:refs/remotes/origin/{0}", b));
926+
}
927+
GitReference::Tag(t) => {
928+
refspecs.push(format!("+refs/tags/{0}:refs/remotes/origin/tags/{0}", t));
929+
}
835930

836-
GitReference::Rev(rev) => {
837-
if rev.starts_with("refs/") {
838-
refspecs.push(format!("+{0}:{0}", rev));
839-
} else if let Some(oid_to_fetch) = oid_to_fetch {
840-
refspecs.push(format!("+{0}:refs/commit/{0}", oid_to_fetch));
841-
} else {
842-
// We don't know what the rev will point to. To handle this
843-
// situation we fetch all branches and tags, and then we pray
844-
// it's somewhere in there.
845-
refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*"));
931+
GitReference::DefaultBranch => {
846932
refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
847-
tags = true;
933+
}
934+
935+
GitReference::Rev(rev) => {
936+
if rev.starts_with("refs/") {
937+
refspecs.push(format!("+{0}:{0}", rev));
938+
} else {
939+
// We don't know what the rev will point to. To handle this
940+
// situation we fetch all branches and tags, and then we pray
941+
// it's somewhere in there.
942+
refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*"));
943+
refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
944+
tags = true;
945+
}
848946
}
849947
}
850948
}
@@ -1071,7 +1169,7 @@ fn github_fast_path(
10711169
return Ok(FastPathRev::Indeterminate);
10721170
}
10731171

1074-
let local_object = reference.resolve(repo).ok();
1172+
let local_object = reference.resolve_to_commit(repo).ok();
10751173

10761174
let github_branch_name = match reference {
10771175
GitReference::Branch(branch) => branch,
@@ -1141,7 +1239,7 @@ fn github_fast_path(
11411239
handle.useragent("cargo")?;
11421240
handle.http_headers({
11431241
let mut headers = List::new();
1144-
headers.append("Accept: application/vnd.github.3.sha")?;
1242+
headers.append("Accept: application/vnd.github+json")?;
11451243
if let Some(local_object) = local_object {
11461244
headers.append(&format!("If-None-Match: \"{}\"", local_object))?;
11471245
}
@@ -1161,8 +1259,44 @@ fn github_fast_path(
11611259
if response_code == 304 {
11621260
Ok(FastPathRev::UpToDate)
11631261
} else if response_code == 200 {
1164-
let oid_to_fetch = str::from_utf8(&response_body)?.parse::<Oid>()?;
1165-
Ok(FastPathRev::NeedsFetch(oid_to_fetch))
1262+
#[derive(Debug, Deserialize)]
1263+
struct GithubFastPathJsonResponse {
1264+
#[serde(serialize_with = "serialize_str")]
1265+
#[serde(deserialize_with = "deserialize_str")]
1266+
sha: git2::Oid,
1267+
commit: GithubCommitJsonResponse,
1268+
}
1269+
1270+
#[derive(Debug, Deserialize)]
1271+
struct GithubCommitJsonResponse {
1272+
tree: GithubTreeJsonResponse,
1273+
}
1274+
1275+
#[derive(Debug, Deserialize)]
1276+
struct GithubTreeJsonResponse {
1277+
#[serde(serialize_with = "serialize_str")]
1278+
#[serde(deserialize_with = "deserialize_str")]
1279+
sha: git2::Oid,
1280+
}
1281+
1282+
let data: GithubFastPathJsonResponse = serde_json::from_slice(&response_body)?;
1283+
// We can do a Cargo nonstandard shallow clone, so record the relevant information.
1284+
let bytes = serde_json::to_string(&ShallowDataBlob {
1285+
tree: data.commit.tree.sha,
1286+
etag: data.sha,
1287+
})
1288+
.expect("why cant we make json of this");
1289+
let shallow_blob = repo.blob(bytes.as_bytes())?;
1290+
repo.reference(
1291+
&format!(
1292+
"refs/cargo-{}",
1293+
serde_json::to_string(reference).expect("why cant we make json of this")
1294+
),
1295+
shallow_blob,
1296+
true,
1297+
"",
1298+
)?;
1299+
Ok(FastPathRev::NeedsFetch(data.commit.tree.sha))
11661300
} else {
11671301
// Usually response_code == 404 if the repository does not exist, and
11681302
// response_code == 422 if exists but GitHub is unable to resolve the

src/cargo/sources/registry/remote.rs

+5-15
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ pub struct RemoteRegistry<'cfg> {
2929
config: &'cfg Config,
3030
tree: RefCell<Option<git2::Tree<'static>>>,
3131
repo: LazyCell<git2::Repository>,
32-
head: Cell<Option<git2::Oid>>,
3332
current_sha: Cell<Option<InternedString>>,
3433
needs_update: bool, // Does this registry need to be updated?
3534
updated: bool, // Has this registry been updated this session?
@@ -46,7 +45,6 @@ impl<'cfg> RemoteRegistry<'cfg> {
4645
index_git_ref: GitReference::DefaultBranch,
4746
tree: RefCell::new(None),
4847
repo: LazyCell::new(),
49-
head: Cell::new(None),
5048
current_sha: Cell::new(None),
5149
needs_update: false,
5250
updated: false,
@@ -96,15 +94,6 @@ impl<'cfg> RemoteRegistry<'cfg> {
9694
})
9795
}
9896

99-
fn head(&self) -> CargoResult<git2::Oid> {
100-
if self.head.get().is_none() {
101-
let repo = self.repo()?;
102-
let oid = self.index_git_ref.resolve(repo)?;
103-
self.head.set(Some(oid));
104-
}
105-
Ok(self.head.get().unwrap())
106-
}
107-
10897
fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
10998
{
11099
let tree = self.tree.borrow();
@@ -113,8 +102,8 @@ impl<'cfg> RemoteRegistry<'cfg> {
113102
}
114103
}
115104
let repo = self.repo()?;
116-
let commit = repo.find_commit(self.head()?)?;
117-
let tree = commit.tree()?;
105+
let oid = self.index_git_ref.resolve_to_object(repo)?;
106+
let tree = repo.find_object(oid, None)?.peel_to_tree()?;
118107

119108
// Unfortunately in libgit2 the tree objects look like they've got a
120109
// reference to the repository object which means that a tree cannot
@@ -137,7 +126,9 @@ impl<'cfg> RemoteRegistry<'cfg> {
137126
if let Some(sha) = self.current_sha.get() {
138127
return Some(sha);
139128
}
140-
let sha = InternedString::new(&self.head().ok()?.to_string());
129+
let repo = self.repo().ok()?;
130+
let oid = self.index_git_ref.resolve_to_commit(repo).ok()?;
131+
let sha = InternedString::new(&oid.to_string());
141132
self.current_sha.set(Some(sha));
142133
Some(sha)
143134
}
@@ -277,7 +268,6 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
277268
self.config.http()?;
278269

279270
self.prepare()?;
280-
self.head.set(None);
281271
*self.tree.borrow_mut() = None;
282272
self.current_sha.set(None);
283273
let path = self.config.assert_package_cache_locked(&self.index_path);

0 commit comments

Comments
 (0)