Img2img task (#175) · huggingface/huggingface.js@a691796 · GitHub

Commit a691796

Img2img task (#175)
1 parent 29fd5b2 commit a691796

File tree: 8 files changed (+125, -1 lines)

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -102,3 +102,5 @@ dist
 
 # TernJS port file
 .tern-port
+
+.DS_Store

packages/inference/README.md

Lines changed: 11 additions & 0 deletions

@@ -170,6 +170,14 @@ await hf.imageToText({
   model: 'nlpconnect/vit-gpt2-image-captioning'
 })
 
+await hf.imageToImage({
+  inputs: readFileSync("test/stormtrooper_depth.png"),
+  parameters: {
+    prompt: "elmo's lecture",
+  },
+  model: "lllyasviel/sd-controlnet-depth",
+});
+
 // Multimodal
 
 await hf.visualQuestionAnswering({

@@ -260,12 +268,15 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
 - [x] Image segmentation
 - [x] Text to image
 - [x] Image to text - [demo](https://huggingface.co/spaces/huggingfacejs/image-to-text)
+- [x] Image to Image
 
 ### Multimodal
+
 - [x] Document question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 - [x] Visual question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 
 ### Tabular
+
 - [x] Tabular regression
 
 ## Tree-shaking
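Since the new task is re-exported from the package barrel (see the packages/inference/src/tasks/index.ts change below), it should also be usable with the direct, tree-shakable import style this README describes. A minimal sketch; the token and file path here are placeholders, not part of this commit:

import { imageToImage } from "@huggingface/inference";
import { readFileSync } from "node:fs";

// Standalone task call: accessToken is passed explicitly since there is
// no HfInference instance to carry it.
const image = await imageToImage({
	accessToken: "hf_...", // placeholder token
	inputs: new Blob([readFileSync("test/stormtrooper_depth.png")]),
	parameters: { prompt: "elmo's lecture" },
	model: "lllyasviel/sd-controlnet-depth",
});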
packages/inference/src/tasks/cv/imageToImage.ts

Lines changed: 83 additions & 0 deletions (new file)

@@ -0,0 +1,83 @@
import { InferenceOutputError } from "../../lib/InferenceOutputError";
import type { BaseArgs, Options, RequestArgs } from "../../types";
import { request } from "../custom/request";
import { base64FromBytes } from "@huggingface/shared";

export type ImageToImageArgs = BaseArgs & {
	/**
	 * The initial image condition
	 *
	 **/
	inputs: Blob | ArrayBuffer;

	parameters?: {
		/**
		 * The text prompt to guide the image generation.
		 */
		prompt?: string;
		/**
		 * The strength param only works for SD img2img and alt diffusion img2img models.
		 * Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
		 * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
		 * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
		 * be maximum and the denoising process will run for the full number of iterations specified in
		 * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
		 **/
		strength?: number;
		/**
		 * An optional negative prompt for the image generation
		 */
		negative_prompt?: string;
		/**
		 * The height in pixels of the generated image
		 */
		height?: number;
		/**
		 * The width in pixels of the generated image
		 */
		width?: number;
		/**
		 * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
		 */
		num_inference_steps?: number;
		/**
		 * Guidance scale: a higher guidance scale encourages the model to generate images closely linked to the text `prompt`, usually at the expense of lower image quality.
		 */
		guidance_scale?: number;
		/**
		 * guess_mode only works for ControlNet models and defaults to False. In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
		 * you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
		 */
		guess_mode?: boolean;
	};
};

export type ImageToImageOutput = Blob;

/**
 * This task takes an input image (optionally guided by a text prompt) and outputs a new image.
 * Recommended model: lllyasviel/sd-controlnet-depth
 */
export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
	let reqArgs: RequestArgs;
	if (!args.parameters) {
		reqArgs = {
			accessToken: args.accessToken,
			model: args.model,
			data: args.inputs,
		};
	} else {
		reqArgs = {
			...args,
			inputs: base64FromBytes(
				new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
			),
		};
	}
	const res = await request<ImageToImageOutput>(reqArgs, options);
	const isValidOutput = res && res instanceof Blob;
	if (!isValidOutput) {
		throw new InferenceOutputError("Expected Blob");
	}
	return res;
}
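Note the branching above: with no parameters, the image bytes are sent as the raw request body; with parameters, the image is base64-encoded so it can travel inside a JSON payload. A minimal end-to-end usage sketch of the new method; the environment variable and output path are illustrative assumptions, not part of this commit:

import { HfInference } from "@huggingface/inference";
import { readFileSync, writeFileSync } from "node:fs";

const hf = new HfInference(process.env.HF_TOKEN); // assumed env var

// parameters are present, so the client base64-encodes the image into JSON
const result = await hf.imageToImage({
	inputs: new Blob([readFileSync("test/stormtrooper_depth.png")]),
	parameters: {
		prompt: "elmo's lecture",
		num_inference_steps: 25,
	},
	model: "lllyasviel/sd-controlnet-depth",
});

// the task resolves to a Blob holding the generated image
writeFileSync("stormtrooper_out.png", Buffer.from(await result.arrayBuffer()));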

packages/inference/src/tasks/index.ts

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ export * from "./cv/imageSegmentation";
 export * from "./cv/imageToText";
 export * from "./cv/objectDetection";
 export * from "./cv/textToImage";
+export * from "./cv/imageToImage";
 
 // Natural Language Processing tasks
 export * from "./nlp/conversational";

packages/inference/test/HfInference.spec.ts

Lines changed: 20 additions & 0 deletions

@@ -431,6 +431,26 @@ describe.concurrent(
 			])
 		);
 	});
+	it("imageToImage", async () => {
+		const num_inference_steps = 25;
+
+		const res = await hf.imageToImage({
+			inputs: new Blob([readTestFile("stormtrooper_depth.png")], { type: "image/png" }),
+			parameters: {
+				prompt: "elmo's lecture",
+				num_inference_steps,
+			},
+			model: "lllyasviel/sd-controlnet-depth",
+		});
+		expect(res).toBeInstanceOf(Blob);
+	});
+	it("imageToImage blob data", async () => {
+		const res = await hf.imageToImage({
+			inputs: new Blob([readTestFile("bird_canny.png")], { type: "image/png" }),
+			model: "lllyasviel/sd-controlnet-canny",
+		});
+		expect(res).toBeInstanceOf(Blob);
+	});
 	it("textToImage", async () => {
 		const res = await hf.textToImage({
 			inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]",
packages/inference/test/bird_canny.png

Binary file added (28.4 KB)

packages/inference/test/stormtrooper_depth.png

Binary file added (52.9 KB)

packages/inference/vitest.config.ts

Lines changed: 8 additions & 1 deletion

@@ -3,7 +3,14 @@ import { readFileSync } from "node:fs";
 import { join } from "node:path";
 
 // make local test files available in browser by preloading their contents
-const testFilesToPreload = ["cheetah.png", "cats.png", "sample1.flac", "invoice.png"];
+const testFilesToPreload = [
+	"cheetah.png",
+	"cats.png",
+	"sample1.flac",
+	"invoice.png",
+	"stormtrooper_depth.png",
+	"bird_canny.png",
+];
 const testFilesContents: Record<string, string> = {};
 for (const filename of testFilesToPreload) {
 	testFilesContents[filename] = readFileSync(join(__dirname, "test", filename)).toString("base64");
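The idea is that browser test runs have no filesystem, so file contents are baked into the bundle as base64. A hypothetical sketch of the decoding side used by readTestFile in the spec above; the TEST_FILES global and the define wiring are assumptions, not shown in this diff:

// Hypothetical: assume the vitest config injects testFilesContents into
// the bundle as a global named TEST_FILES (e.g. via a `define` entry).
declare const TEST_FILES: Record<string, string>;

function readTestFile(filename: string): Uint8Array {
	const binary = atob(TEST_FILES[filename]); // base64 -> binary string
	const bytes = new Uint8Array(binary.length);
	for (let i = 0; i < binary.length; i++) {
		bytes[i] = binary.charCodeAt(i);
	}
	return bytes;
}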
