(hub) Compute sha in a streaming fashion (#10) · huggingface/huggingface.js@0f475e9
Commit 0f475e9

(hub) Compute sha in a streaming fashion (#10)

- Compute sha in a streaming fashion
- Remove support for non-Blobs in `commit`
- Add a dependency `hash-wasm` for files > 10 MB on browser
- Split utils into multiple files

1 parent 7c7e3db, commit 0f475e9

18 files changed: +241 −238 lines
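Concretely, "streaming" here means the content `Blob` is read slice by slice and fed to an incremental hasher, instead of being hashed as one big `ArrayBuffer`. A minimal sketch of the idea, assuming `hash-wasm`'s `createSHA256()` API (the helper name and the read size below are illustrative, not the exact code added in this commit):

```ts
import { createSHA256 } from "hash-wasm";

// Illustrative sketch: hash a Blob without ever holding more than one slice in memory.
async function sha256Streaming(blob: Blob): Promise<string> {
	const hasher = await createSHA256(); // WASM-backed incremental hasher
	hasher.init();
	const readSize = 1024 * 1024; // arbitrary 1 MB read size for this sketch
	for (let offset = 0; offset < blob.size; offset += readSize) {
		const chunk = await blob.slice(offset, offset + readSize).arrayBuffer();
		hasher.update(new Uint8Array(chunk));
	}
	return hasher.digest("hex"); // hex digest, the format used for LFS oids
}
```

For small files, or outside the browser, a one-shot `crypto.subtle.digest("SHA-256", await blob.arrayBuffer())` remains an option; per the commit description, `hash-wasm` is only pulled in for browser files above 10 MB.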

packages/hub/.eslintrc.cjs

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ module.exports = {
 		node: true,
 	},
 	rules: {
+		"no-constant-condition": "off",
 		"@typescript-eslint/no-empty-function": "off",
 		"@typescript-eslint/explicit-module-boundary-types": "error",
 		"@typescript-eslint/consistent-type-imports": "error",

packages/hub/README.md

Lines changed: 10 additions & 4 deletions

@@ -8,8 +8,6 @@ Official utilities to use the Hugging Face hub API
 npm add @huggingface/hub
 ```
 
-See also [TimMikeladze/huggingface](https://github.com/TimMikeladze/huggingface) for a non-official wrapper to use the inference API
-
 ## API
 
 ```ts
@@ -28,8 +26,6 @@ await commit({
 		{
 			operation: "addOrUpdate",
 			path: "file.txt",
-			// or new TextEncoder().encode("Hello World")
-			// or Buffer.from("Hello world")
 			content: new Blob(["Hello World"]),
 		},
 	],
@@ -40,3 +36,13 @@ await (await downloadFile({ repo, path: "README.md" })).text();
 
 await deleteRepo({ repo, credentials });
 ```
+
+## Performance considerations
+
+When uploading large files, you may want to run the `commit` calls inside a worker, to offload the sha256 computations.
+
+Also, use `Blob` to avoid loading the whole files in RAM. In `Node`, it's up to you to provide a smart `Blob` wrapper around your file. Feel free to open an issue if you want *us* to provide the smart `Blob` implementation.
+
+## Dependencies
+
+- `hash-wasm` : Only used in the browser, when committing files over 10 MB. Browsers do not natively support streaming sha256 computations.
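The "smart `Blob`" mentioned in the new Performance considerations section is left to the caller; one way to build it in Node is a `Blob` subclass that wraps a file path and only touches the disk when bytes are actually requested. A rough sketch under that assumption (the `FileBlob` name and implementation are hypothetical, not part of this commit):

```ts
import { createReadStream, promises as fs } from "node:fs";

// Hypothetical lazy Blob over a file on disk: `size` and `slice()` are cheap,
// bytes are only read from disk when `arrayBuffer()` is called.
class FileBlob extends Blob {
	private constructor(private path: string, private start: number, private length: number) {
		super(); // empty parent Blob; the relevant members are overridden below
	}

	static async create(path: string): Promise<FileBlob> {
		const { size } = await fs.stat(path);
		return new FileBlob(path, 0, size);
	}

	override get size(): number {
		return this.length;
	}

	override slice(start = 0, end = this.length): FileBlob {
		const begin = Math.min(Math.max(start, 0), this.length);
		const finish = Math.min(Math.max(end, begin), this.length);
		return new FileBlob(this.path, this.start + begin, finish - begin);
	}

	override async arrayBuffer(): Promise<ArrayBuffer> {
		if (this.length === 0) {
			return new ArrayBuffer(0);
		}
		const chunks: Buffer[] = [];
		// `end` is inclusive for createReadStream
		for await (const chunk of createReadStream(this.path, { start: this.start, end: this.start + this.length - 1 })) {
			chunks.push(chunk as Buffer);
		}
		const buf = Buffer.concat(chunks);
		return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
	}
}
```

Judging from the code visible in this diff, `commit` mainly calls `size`, `slice()`, `arrayBuffer()` and (for `.gitattributes`) `text()` on the content, so a real wrapper would also want to override `text()` and `stream()`.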

packages/hub/package.json

Lines changed: 3 additions & 0 deletions

@@ -50,5 +50,8 @@
 		"tsup": "^6.6.3",
 		"type-fest": "^3.5.6",
 		"typescript": "^4.9.5"
+	},
+	"dependencies": {
+		"hash-wasm": "^4.9.0"
 	}
 }

packages/hub/pnpm-lock.yaml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default.

packages/hub/src/lib/commit.spec.ts

Lines changed: 8 additions & 79 deletions

@@ -7,83 +7,12 @@ import { createRepo } from "./create-repo";
 import { deleteRepo } from "./delete-repo";
 import { downloadFile } from "./download-file";
 
-describe("commit", () => {
-	it("should commit to a repo with buffers", async () => {
-		const repoName = `${TEST_USER}/TEST-${randomBytes(10).toString("hex")}`;
-		const repo: RepoId = {
-			name: repoName,
-			type: "model",
-		};
-
-		await createRepo({
-			credentials: {
-				accessToken: TEST_ACCESS_TOKEN,
-			},
-			repo,
-			license: "mit",
-		});
+const lfsContent = "O123456789".repeat(100_000);
 
-		const readme1 = await downloadFile({ repo, path: "README.md" });
-		assert.strictEqual(readme1?.status, 200);
-
-		try {
-			await commit({
-				repo,
-				title: "Some commit",
-				credentials: {
-					accessToken: TEST_ACCESS_TOKEN,
-				},
-				operations: [
-					{
-						operation: "addOrUpdate",
-						content: Buffer.from("This is me"),
-						path: "test.txt",
-					},
-					{
-						operation: "addOrUpdate",
-						content: Buffer.from("This is a LFS file"),
-						path: "test.lfs.txt",
-					},
-					{
-						operation: "delete",
-						path: "README.md",
-					},
-				],
-			});
-
-			const fileContent = await downloadFile({ repo, path: "test.txt" });
-			assert.strictEqual(fileContent?.status, 200);
-			assert.strictEqual(await fileContent?.text(), "This is me");
-
-			const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt" });
-			assert.strictEqual(lfsFileContent?.status, 200);
-			assert.strictEqual(await lfsFileContent?.text(), "This is a LFS file");
-
-			const lfsFilePointer = await fetch(`${HUB_URL}/${repoName}/raw/main/test.lfs.txt`);
-			assert.strictEqual(lfsFilePointer.status, 200);
-			assert.strictEqual(
-				(await lfsFilePointer.text()).trim(),
-				`
-version https://git-lfs.github.com/spec/v1
-oid sha256:7ee757a47707069c6016b2751fdc7cbe4ed151530d9039cf99f6f6921509aa05
-size 18
-`.trim()
-			);
-
-			const readme2 = await downloadFile({ repo, path: "README.md" });
-			assert.strictEqual(readme2, null);
-		} finally {
-			await deleteRepo({
-				repo: {
-					name: repoName,
-					type: "model",
-				},
-				credentials: { accessToken: TEST_ACCESS_TOKEN },
-			});
-		}
-	});
+
+describe("commit", () => {
+	it("should commit to a repo with blobs", async function () {
+		this.timeout(30_000);
 
-	it("should commit to a repo with blobs", async () => {
 		const repoName = `${TEST_USER}/TEST-${randomBytes(10).toString("hex")}`;
 		const repo: RepoId = {
 			name: repoName,
@@ -116,7 +45,7 @@ size 18
 				},
 				{
 					operation: "addOrUpdate",
-					content: new Blob(["This is a LFS file"]),
+					content: new Blob([lfsContent]),
 					path: "test.lfs.txt",
 				},
 				{
@@ -132,16 +61,16 @@ size 18
 
 		const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt" });
 		assert.strictEqual(lfsFileContent?.status, 200);
-		assert.strictEqual(await lfsFileContent?.text(), "This is a LFS file");
+		assert.strictEqual(await lfsFileContent?.text(), lfsContent);
 
 		const lfsFilePointer = await fetch(`${HUB_URL}/${repoName}/raw/main/test.lfs.txt`);
 		assert.strictEqual(lfsFilePointer.status, 200);
 		assert.strictEqual(
 			(await lfsFilePointer.text()).trim(),
 			`
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ee757a47707069c6016b2751fdc7cbe4ed151530d9039cf99f6f6921509aa05
-size 18
+oid sha256:a3bbce7ee1df7233d85b5f4d60faa3755f93f537804f8b540c72b0739239ddf8
+size ${lfsContent.length}
 `.trim()
 		);
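For reference, the pointer file asserted above follows the git-lfs spec: `oid` is the sha256 of the file content and `size` its byte length. The hard-coded hash can be sanity-checked with WebCrypto (an illustrative snippet, not part of the test; run it in any async context):

```ts
// Recompute the oid/size the test expects for its 1 MB LFS payload.
const payload = new TextEncoder().encode("O123456789".repeat(100_000));
const digest = await crypto.subtle.digest("SHA-256", payload);
const oid = [...new Uint8Array(digest)].map((b) => b.toString(16).padStart(2, "0")).join("");
console.log(`oid sha256:${oid}`); // should print the oid asserted above
console.log(`size ${payload.length}`); // 1000000
```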

packages/hub/src/lib/commit.ts

Lines changed: 9 additions & 25 deletions

@@ -22,7 +22,7 @@ export type CommitDeletedEntry = {
 	path: string;
 };
 
-type ContentSource = ArrayBuffer | Blob; // Todo: support web streams
+type ContentSource = Blob; // Todo: offer a smart Blob wrapper around (filePath + size) for Node.js
 
 export type CommitFile = {
 	operation: "addOrUpdate";
@@ -73,22 +73,6 @@ function isFileOperation(op: CommitOperation): op is CommitFile {
 	return op.operation === "addOrUpdate";
 }
 
-async function toString(source: ContentSource) {
-	return source instanceof Blob ? await source.text() : new TextDecoder("utf-8").decode(source);
-}
-
-function byteLength(source: ContentSource) {
-	return source instanceof Blob ? source.size : source.byteLength;
-}
-
-async function sample(source: ContentSource) {
-	return source instanceof Blob ? await source.slice(0, 512).arrayBuffer() : source.slice(0, 512);
-}
-
-async function toArrayBuffer(source: ContentSource): Promise<ArrayBuffer> {
-	return source instanceof Blob ? await source.arrayBuffer() : source;
-}
-
 /**
  * Internal function for now, used by commit.
  *
@@ -105,12 +89,12 @@ async function* commitIter(params: CommitParams): AsyncGenerator<unknown, Commit
 
 	for (const operations of chunk(params.operations.filter(isFileOperation), 100)) {
 		const payload: ApiPreuploadRequest = {
-			gitAttributes: gitAttributes && (await toString(gitAttributes)),
+			gitAttributes: gitAttributes && (await gitAttributes.text()),
 			files: await Promise.all(
 				operations.map(async (operation) => ({
 					path: operation.path,
-					size: byteLength(operation.content),
-					sample: base64FromBytes(new Uint8Array(await sample(operation.content))),
+					size: operation.content.size,
+					sample: base64FromBytes(new Uint8Array(await operation.content.slice(0, 512).arrayBuffer())),
 				}))
 			),
 		};
@@ -152,7 +136,7 @@ async function* commitIter(params: CommitParams): AsyncGenerator<unknown, Commit
 
 		const shas = await promisesQueue(
 			operations.map((op) => async () => {
-				const sha = await sha256(await toArrayBuffer(op.content));
+				const sha = await sha256(op.content);
 				lfsShas.set(op.path, sha);
 				return sha;
 			}),
@@ -169,7 +153,7 @@ async function* commitIter(params: CommitParams): AsyncGenerator<unknown, Commit
 			},
 			objects: operations.map((op, i) => ({
 				oid: shas[i],
-				size: byteLength(op.content),
+				size: op.content.size,
 			})),
 		};
 
@@ -221,7 +205,7 @@ async function* commitIter(params: CommitParams): AsyncGenerator<unknown, Commit
 			const completionUrl = obj.actions.upload.href;
 			const parts = Object.keys(header).filter((key) => /^[0-9]+$/.test(key));
 
-			if (parts.length !== Math.ceil(byteLength(content) / chunkSize)) {
+			if (parts.length !== Math.ceil(content.size / chunkSize)) {
 				throw new Error("Invalid server response to upload large LFS file, wrong number of parts");
 			}
 
@@ -326,7 +310,7 @@ async function* commitIter(params: CommitParams): AsyncGenerator<unknown, Commit
 				value: {
 					path: operation.path,
 					algo: "sha256",
-					size: byteLength(operation.content),
+					size: operation.content.size,
 					oid: lfsShas.get(operation.path)!,
 				} satisfies ApiCommitLfsFile,
 			}
@@ -371,7 +355,7 @@ async function convertOperationToNdJson(operation: CommitOperation): Promise<Api
 	return {
 		key: "file",
 		value: {
-			content: base64FromBytes(new Uint8Array(await toArrayBuffer(operation.content))),
+			content: base64FromBytes(new Uint8Array(await operation.content.arrayBuffer())),
 			path: operation.path,
 			encoding: "base64",
 		},
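Since `ContentSource` is now just `Blob`, callers that previously passed an `ArrayBuffer` or a Node `Buffer` have to wrap their bytes in a `Blob` themselves. A hedged migration example (assuming `repo` and `credentials` are set up as in the README):

```ts
// Before this commit:  content: Buffer.from("Hello World")
// After this commit:   wrap the bytes (or the string) in a Blob
await commit({
	repo,
	credentials,
	title: "Add file.txt",
	operations: [
		{
			operation: "addOrUpdate",
			path: "file.txt",
			content: new Blob([Buffer.from("Hello World")]), // new Blob(["Hello World"]) works too
		},
	],
});
```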
