Img2img task (#175) · huggingface/huggingface.js@a691796 · GitHub

Commit a691796

Img2img task (#175)
1 parent 29fd5b2 commit a691796

File tree: 8 files changed (+125, -1 lines)

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -102,3 +102,5 @@ dist
 
 # TernJS port file
 .tern-port
+
+.DS_Store

packages/inference/README.md

Lines changed: 11 additions & 0 deletions

@@ -170,6 +170,14 @@ await hf.imageToText({
   model: 'nlpconnect/vit-gpt2-image-captioning'
 })
 
+await hf.imageToImage({
+  inputs: readFileSync("test/stormtrooper_depth.png"),
+  parameters: {
+    prompt: "elmo's lecture",
+  },
+  model: "lllyasviel/sd-controlnet-depth",
+});
+
 // Multimodal
 
 await hf.visualQuestionAnswering({

@@ -260,12 +268,15 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
 - [x] Image segmentation
 - [x] Text to image
 - [x] Image to text - [demo](https://huggingface.co/spaces/huggingfacejs/image-to-text)
+- [x] Image to Image
 
 ### Multimodal
+
 - [x] Document question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 - [x] Visual question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 
 ### Tabular
+
 - [x] Tabular regression
 
 ## Tree-shaking
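Since the new task is re-exported from the package barrel (see the packages/inference/src/tasks/index.ts change below), it should also be usable with the direct, tree-shakable import style this README describes. A minimal sketch; the token and file path here are placeholders, not part of this commit:

import { imageToImage } from "@huggingface/inference";
import { readFileSync } from "node:fs";

// Standalone task call: accessToken is passed explicitly since there is
// no HfInference instance to carry it.
const image = await imageToImage({
	accessToken: "hf_...", // placeholder token
	inputs: new Blob([readFileSync("test/stormtrooper_depth.png")]),
	parameters: { prompt: "elmo's lecture" },
	model: "lllyasviel/sd-controlnet-depth",
});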
packages/inference/src/tasks/cv/imageToImage.ts

Lines changed: 83 additions & 0 deletions (new file)

@@ -0,0 +1,83 @@
import { InferenceOutputError } from "../../lib/InferenceOutputError";
import type { BaseArgs, Options, RequestArgs } from "../../types";
import { request } from "../custom/request";
import { base64FromBytes } from "@huggingface/shared";

export type ImageToImageArgs = BaseArgs & {
	/**
	 * The initial image condition
	 *
	 **/
	inputs: Blob | ArrayBuffer;

	parameters?: {
		/**
		 * The text prompt to guide the image generation.
		 */
		prompt?: string;
		/**
		 * The strength param only works for SD img2img and alt diffusion img2img models.
		 * Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
		 * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
		 * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
		 * be maximum and the denoising process will run for the full number of iterations specified in
		 * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
		 **/
		strength?: number;
		/**
		 * An optional negative prompt for the image generation
		 */
		negative_prompt?: string;
		/**
		 * The height in pixels of the generated image
		 */
		height?: number;
		/**
		 * The width in pixels of the generated image
		 */
		width?: number;
		/**
		 * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
		 */
		num_inference_steps?: number;
		/**
		 * Guidance scale: a higher guidance scale encourages the model to generate images closely linked to the text `prompt`, usually at the expense of lower image quality.
		 */
		guidance_scale?: number;
		/**
		 * guess_mode only works for ControlNet models and defaults to False. In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
		 * you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
		 */
		guess_mode?: boolean;
	};
};

export type ImageToImageOutput = Blob;

/**
 * This task takes an input image (optionally guided by a text prompt) and outputs a new image.
 * Recommended model: lllyasviel/sd-controlnet-depth
 */
export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
	let reqArgs: RequestArgs;
	if (!args.parameters) {
		reqArgs = {
			accessToken: args.accessToken,
			model: args.model,
			data: args.inputs,
		};
	} else {
		reqArgs = {
			...args,
			inputs: base64FromBytes(
				new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
			),
		};
	}
	const res = await request<ImageToImageOutput>(reqArgs, options);
	const isValidOutput = res && res instanceof Blob;
	if (!isValidOutput) {
		throw new InferenceOutputError("Expected Blob");
	}
	return res;
}
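Note the branching above: with no parameters, the image bytes are sent as the raw request body; with parameters, the image is base64-encoded so it can travel inside a JSON payload. A minimal end-to-end usage sketch of the new method; the environment variable and output path are illustrative assumptions, not part of this commit:

import { HfInference } from "@huggingface/inference";
import { readFileSync, writeFileSync } from "node:fs";

const hf = new HfInference(process.env.HF_TOKEN); // assumed env var

// parameters are present, so the client base64-encodes the image into JSON
const result = await hf.imageToImage({
	inputs: new Blob([readFileSync("test/stormtrooper_depth.png")]),
	parameters: {
		prompt: "elmo's lecture",
		num_inference_steps: 25,
	},
	model: "lllyasviel/sd-controlnet-depth",
});

// the task resolves to a Blob holding the generated image
writeFileSync("stormtrooper_out.png", Buffer.from(await result.arrayBuffer()));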

packages/inference/src/tasks/index.ts

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ export * from "./cv/imageSegmentation";
 export * from "./cv/imageToText";
 export * from "./cv/objectDetection";
 export * from "./cv/textToImage";
+export * from "./cv/imageToImage";
 
 // Natural Language Processing tasks
 export * from "./nlp/conversational";

packages/inference/test/HfInference.spec.ts

Lines changed: 20 additions & 0 deletions

@@ -431,6 +431,26 @@ describe.concurrent(
 			])
 		);
 	});
+	it("imageToImage", async () => {
+		const num_inference_steps = 25;
+
+		const res = await hf.imageToImage({
+			inputs: new Blob([readTestFile("stormtrooper_depth.png")], { type: "image/png" }),
+			parameters: {
+				prompt: "elmo's lecture",
+				num_inference_steps,
+			},
+			model: "lllyasviel/sd-controlnet-depth",
+		});
+		expect(res).toBeInstanceOf(Blob);
+	});
+	it("imageToImage blob data", async () => {
+		const res = await hf.imageToImage({
+			inputs: new Blob([readTestFile("bird_canny.png")], { type: "image/png" }),
+			model: "lllyasviel/sd-controlnet-canny",
+		});
+		expect(res).toBeInstanceOf(Blob);
+	});
 	it("textToImage", async () => {
 		const res = await hf.textToImage({
 			inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]",
packages/inference/test/bird_canny.png

Binary file added (28.4 KB)

packages/inference/test/stormtrooper_depth.png

Binary file added (52.9 KB)

packages/inference/vitest.config.ts

Lines changed: 8 additions & 1 deletion

@@ -3,7 +3,14 @@ import { readFileSync } from "node:fs";
 import { join } from "node:path";
 
 // make local test files available in browser by preloading their contents
-const testFilesToPreload = ["cheetah.png", "cats.png", "sample1.flac", "invoice.png"];
+const testFilesToPreload = [
+	"cheetah.png",
+	"cats.png",
+	"sample1.flac",
+	"invoice.png",
+	"stormtrooper_depth.png",
+	"bird_canny.png",
+];
 const testFilesContents: Record<string, string> = {};
 for (const filename of testFilesToPreload) {
 	testFilesContents[filename] = readFileSync(join(__dirname, "test", filename)).toString("base64");
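The idea is that browser test runs have no filesystem, so file contents are baked into the bundle as base64. A hypothetical sketch of the decoding side used by readTestFile in the spec above; the TEST_FILES global and the define wiring are assumptions, not shown in this diff:

// Hypothetical: assume the vitest config injects testFilesContents into
// the bundle as a global named TEST_FILES (e.g. via a `define` entry).
declare const TEST_FILES: Record<string, string>;

function readTestFile(filename: string): Uint8Array {
	const binary = atob(TEST_FILES[filename]); // base64 -> binary string
	const bytes = new Uint8Array(binary.length);
	for (let i = 0; i < binary.length; i++) {
		bytes[i] = binary.charCodeAt(i);
	}
	return bytes;
}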
