Skip to content

Commit a27b5b4

Browse files
authored
Download using xet (#1305)
cc @hanouticelina @Wauplin @bpronan @assafvayner `downloadFile` has a `xet: true` optional param, that when set will download the file with the xet protocol if possible. ## Breaking changes - `downloadFile` returns a `Blob` - `fileDownloadInfo`'s return format is changed: - `downloadLink` => `url`, and it's present every time - optional `xet` prop for xet files ## Concerns https://huggingface.co/spaces/coyotte508/hub-xet uses quite a bit of CPU when downloading a full xet file, maybe we need to introduce multi-threading (with workers?) or optimize perf Especially since we use it to parse safetensors data now cc @Kakulukian ## Performance work Before releasing out of experimental, some things need to be addressed & tested on different engines (node, browser): - CPU usage, eg how fast it is to de-chunk 10GB of local data. Can it be significantly improved with moving some of the code to WASM. - Stream backpressure, how much RAM does get used, especially due to different handling of Web streams, and if we can make it consistent across - Http calls are sequentially made, to save RAM, but it can hurt in high-ping high-bandwidth situations. Not a problem if we download multiple files in parallel, but when downloading one large file, we probably don't care about reading data on the wire and can just promise-queue a bunch of http calls and use some RAM. We can probably change the behavior of the lib depending on the downloaded file's size. Being able to upload xet files would be nice too, experimentally as well (eg for hub-ci tests)
1 parent b578227 commit a27b5b4

21 files changed

+367
-310
lines changed

CONTRIBUTING.md

+1-3
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ It's not a hard requirement, but please consider using an icon from [Gitmoji](ht
2020

2121
If you want to run only specific tests, you can do `pnpm test -- -t "test name"`.
2222

23-
You can also do `npx vitest ./packages/hub/src/utils/XetBlob.spec.ts` to run a specific test file.
24-
25-
Or `cd packages/hub && npx vitest --browser.name=chrome --browser.headless --config vitest-browser.config.mts ./src/utils/XetBlob.spec.ts` to run browser tests on a specific file
23+
You can also do `pnpm --filter hub test ./src/utils/XetBlob.spec.ts` to run a specific test file.
2624

2725
## Adding a package
2826

e2e/ts/.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
package-lock.json
1+
package-lock.json
2+
pnpm-lock.yaml

packages/hub/package.json

-3
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,6 @@
5959
],
6060
"author": "Hugging Face",
6161
"license": "MIT",
62-
"devDependencies": {
63-
"@types/node": "^20.11.28"
64-
},
6562
"dependencies": {
6663
"@huggingface/tasks": "workspace:^"
6764
}

packages/hub/pnpm-lock.yaml

-17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/hub/src/lib/commit.spec.ts

+1-5
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ describe("commit", () => {
3333

3434
try {
3535
const readme1 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
36-
assert.strictEqual(readme1?.status, 200);
36+
assert(readme1, "Readme doesn't exist");
3737

3838
const nodeOperation: CommitFile[] = isFrontend
3939
? []
@@ -77,11 +77,9 @@ describe("commit", () => {
7777
});
7878

7979
const fileContent = await downloadFile({ repo, path: "test.txt", hubUrl: TEST_HUB_URL });
80-
assert.strictEqual(fileContent?.status, 200);
8180
assert.strictEqual(await fileContent?.text(), "This is me");
8281

8382
const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt", hubUrl: TEST_HUB_URL });
84-
assert.strictEqual(lfsFileContent?.status, 200);
8583
assert.strictEqual(await lfsFileContent?.text(), lfsContent);
8684

8785
const lfsFileUrl = `${TEST_HUB_URL}/${repoName}/raw/main/test.lfs.txt`;
@@ -98,15 +96,13 @@ size ${lfsContent.length}
9896

9997
if (!isFrontend) {
10098
const fileUrlContent = await downloadFile({ repo, path: "tsconfig.json", hubUrl: TEST_HUB_URL });
101-
assert.strictEqual(fileUrlContent?.status, 200);
10299
assert.strictEqual(
103100
await fileUrlContent?.text(),
104101
(await import("node:fs")).readFileSync("./tsconfig.json", "utf-8")
105102
);
106103
}
107104

108105
const webResourceContent = await downloadFile({ repo, path: "lamaral.json", hubUrl: TEST_HUB_URL });
109-
assert.strictEqual(webResourceContent?.status, 200);
110106
assert.strictEqual(await webResourceContent?.text(), await (await fetch(tokenizerJsonUrl)).text());
111107

112108
const readme2 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });

packages/hub/src/lib/download-file-to-cache-dir.spec.ts

+22-9
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,26 @@
11
import { expect, test, describe, vi, beforeEach } from "vitest";
22
import type { RepoDesignation, RepoId } from "../types/public";
33
import { dirname, join } from "node:path";
4-
import { lstat, mkdir, stat, symlink, writeFile, rename } from "node:fs/promises";
4+
import { lstat, mkdir, stat, symlink, rename } from "node:fs/promises";
55
import { pathsInfo } from "./paths-info";
6-
import type { Stats } from "node:fs";
6+
import { createWriteStream, type Stats } from "node:fs";
77
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
88
import { toRepoId } from "../utils/toRepoId";
99
import { downloadFileToCacheDir } from "./download-file-to-cache-dir";
1010
import { createSymlink } from "../utils/symlink";
1111

1212
vi.mock("node:fs/promises", () => ({
13-
writeFile: vi.fn(),
1413
rename: vi.fn(),
1514
symlink: vi.fn(),
1615
lstat: vi.fn(),
1716
mkdir: vi.fn(),
1817
stat: vi.fn(),
1918
}));
2019

20+
vi.mock("node:fs", () => ({
21+
createWriteStream: vi.fn(),
22+
}));
23+
2124
vi.mock("./paths-info", () => ({
2225
pathsInfo: vi.fn(),
2326
}));
@@ -63,11 +66,15 @@ describe("downloadFileToCacheDir", () => {
6366
beforeEach(() => {
6467
vi.resetAllMocks();
6568
// mock 200 request
66-
vi.mocked(fetchMock).mockResolvedValue({
67-
status: 200,
68-
ok: true,
69-
body: "dummy-body",
70-
} as unknown as Response);
69+
vi.mocked(fetchMock).mockResolvedValue(
70+
new Response("dummy-body", {
71+
status: 200,
72+
headers: {
73+
etag: DUMMY_ETAG,
74+
"Content-Range": "bytes 0-54/55",
75+
},
76+
})
77+
);
7178

7279
// prevent to use caching
7380
vi.mocked(stat).mockRejectedValue(new Error("Do not exists"));
@@ -235,6 +242,9 @@ describe("downloadFileToCacheDir", () => {
235242
},
236243
]);
237244

245+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
246+
vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any);
247+
238248
const output = await downloadFileToCacheDir({
239249
repo: DUMMY_REPO,
240250
path: "/README.md",
@@ -276,6 +286,9 @@ describe("downloadFileToCacheDir", () => {
276286
},
277287
]);
278288

289+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
290+
vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any);
291+
279292
await downloadFileToCacheDir({
280293
repo: DUMMY_REPO,
281294
path: "/README.md",
@@ -284,7 +297,7 @@ describe("downloadFileToCacheDir", () => {
284297

285298
const incomplete = `${expectedBlob}.incomplete`;
286299
// 1. should write fetch#response#body to incomplete file
287-
expect(writeFile).toHaveBeenCalledWith(incomplete, "dummy-body");
300+
expect(createWriteStream).toHaveBeenCalledWith(incomplete);
288301
// 2. should rename the incomplete to the blob expected name
289302
expect(rename).toHaveBeenCalledWith(incomplete, expectedBlob);
290303
// 3. should create symlink pointing to blob

packages/hub/src/lib/download-file-to-cache-dir.ts

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
22
import { dirname, join } from "node:path";
3-
import { writeFile, rename, lstat, mkdir, stat } from "node:fs/promises";
3+
import { rename, lstat, mkdir, stat } from "node:fs/promises";
44
import type { CommitInfo, PathInfo } from "./paths-info";
55
import { pathsInfo } from "./paths-info";
66
import type { CredentialsParams, RepoDesignation } from "../types/public";
77
import { toRepoId } from "../utils/toRepoId";
88
import { downloadFile } from "./download-file";
99
import { createSymlink } from "../utils/symlink";
10+
import { Readable } from "node:stream";
11+
import type { ReadableStream } from "node:stream/web";
12+
import { pipeline } from "node:stream/promises";
13+
import { createWriteStream } from "node:fs";
1014

1115
export const REGEX_COMMIT_HASH: RegExp = new RegExp("^[0-9a-f]{40}$");
1216

@@ -115,15 +119,16 @@ export async function downloadFileToCacheDir(
115119
const incomplete = `${blobPath}.incomplete`;
116120
console.debug(`Downloading ${params.path} to ${incomplete}`);
117121

118-
const response: Response | null = await downloadFile({
122+
const blob: Blob | null = await downloadFile({
119123
...params,
120124
revision: commitHash,
121125
});
122126

123-
if (!response || !response.ok || !response.body) throw new Error(`invalid response for file ${params.path}`);
127+
if (!blob) {
128+
throw new Error(`invalid response for file ${params.path}`);
129+
}
124130

125-
// @ts-expect-error resp.body is a Stream, but Stream in internal to node
126-
await writeFile(incomplete, response.body);
131+
await pipeline(Readable.fromWeb(blob.stream() as ReadableStream), createWriteStream(incomplete));
127132

128133
// rename .incomplete file to expect blob
129134
await rename(incomplete, blobPath);
+66-49
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,82 @@
1-
import { expect, test, describe, vi } from "vitest";
1+
import { expect, test, describe, assert } from "vitest";
22
import { downloadFile } from "./download-file";
3-
import type { RepoId } from "../types/public";
4-
5-
const DUMMY_REPO: RepoId = {
6-
name: "hello-world",
7-
type: "model",
8-
};
3+
import { deleteRepo } from "./delete-repo";
4+
import { createRepo } from "./create-repo";
5+
import { TEST_ACCESS_TOKEN, TEST_HUB_URL, TEST_USER } from "../test/consts";
6+
import { insecureRandomString } from "../utils/insecureRandomString";
97

108
describe("downloadFile", () => {
11-
test("hubUrl params should overwrite HUB_URL", async () => {
12-
const fetchMock: typeof fetch = vi.fn();
13-
vi.mocked(fetchMock).mockResolvedValue({
14-
status: 200,
15-
ok: true,
16-
} as Response);
9+
test("should download regular file", async () => {
10+
const blob = await downloadFile({
11+
repo: {
12+
type: "model",
13+
name: "openai-community/gpt2",
14+
},
15+
path: "README.md",
16+
});
17+
18+
const text = await blob?.slice(0, 1000).text();
19+
assert(
20+
text?.includes(`---
21+
language: en
22+
tags:
23+
- exbert
24+
25+
license: mit
26+
---
27+
28+
29+
# GPT-2
1730
18-
await downloadFile({
19-
repo: DUMMY_REPO,
20-
path: "/README.md",
21-
hubUrl: "http://dummy-hub",
22-
fetch: fetchMock,
31+
Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large`)
32+
);
33+
});
34+
test("should downoad xet file", async () => {
35+
const blob = await downloadFile({
36+
repo: {
37+
type: "model",
38+
name: "celinah/xet-experiments",
39+
},
40+
path: "large_text.txt",
2341
});
2442

25-
expect(fetchMock).toHaveBeenCalledWith("http://dummy-hub/hello-world/resolve/main//README.md", expect.anything());
43+
const text = await blob?.slice(0, 100).text();
44+
expect(text).toMatch("this is a text file.".repeat(10).slice(0, 100));
2645
});
2746

28-
test("raw params should use raw url", async () => {
29-
const fetchMock: typeof fetch = vi.fn();
30-
vi.mocked(fetchMock).mockResolvedValue({
31-
status: 200,
32-
ok: true,
33-
} as Response);
47+
test("should download private file", async () => {
48+
const repoName = `datasets/${TEST_USER}/TEST-${insecureRandomString()}`;
3449

35-
await downloadFile({
36-
repo: DUMMY_REPO,
37-
path: "README.md",
38-
raw: true,
39-
fetch: fetchMock,
50+
const result = await createRepo({
51+
accessToken: TEST_ACCESS_TOKEN,
52+
hubUrl: TEST_HUB_URL,
53+
private: true,
54+
repo: repoName,
55+
files: [{ path: ".gitattributes", content: new Blob(["*.html filter=lfs diff=lfs merge=lfs -text"]) }],
4056
});
4157

42-
expect(fetchMock).toHaveBeenCalledWith("https://huggingface.co/hello-world/raw/main/README.md", expect.anything());
43-
});
58+
assert.deepStrictEqual(result, {
59+
repoUrl: `${TEST_HUB_URL}/${repoName}`,
60+
});
61+
62+
try {
63+
const blob = await downloadFile({
64+
repo: repoName,
65+
path: ".gitattributes",
66+
hubUrl: TEST_HUB_URL,
67+
accessToken: TEST_ACCESS_TOKEN,
68+
});
4469

45-
test("internal server error should propagate the error", async () => {
46-
const fetchMock: typeof fetch = vi.fn();
47-
vi.mocked(fetchMock).mockResolvedValue({
48-
status: 500,
49-
ok: false,
50-
headers: new Map<string, string>([["Content-Type", "application/json"]]),
51-
json: () => ({
52-
error: "Dummy internal error",
53-
}),
54-
} as unknown as Response);
70+
assert(blob, "File should be found");
5571

56-
await expect(async () => {
57-
await downloadFile({
58-
repo: DUMMY_REPO,
59-
path: "README.md",
60-
raw: true,
61-
fetch: fetchMock,
72+
const text = await blob?.text();
73+
assert.strictEqual(text, "*.html filter=lfs diff=lfs merge=lfs -text");
74+
} finally {
75+
await deleteRepo({
76+
repo: repoName,
77+
hubUrl: TEST_HUB_URL,
78+
accessToken: TEST_ACCESS_TOKEN,
6279
});
63-
}).rejects.toThrowError("Dummy internal error");
80+
}
6481
});
6582
});

0 commit comments

Comments
 (0)