10000 ✨ Text-to-speech task (#167) · huggingface/huggingface.js@40ec8e5 · GitHub
[go: up one dir, main page]

Skip to content

Commit 40ec8e5

Browse files
authored
✨ Text-to-speech task (#167)
1 parent 6883aaa commit 40ec8e5

File tree

5 files changed

+64
-0
lines changed

5 files changed

+64
-0
lines changed

docs/inference/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ await hf.audioClassification({
138138
data: readFileSync('test/sample1.flac')
139139
})
140140

141+
await hf.textToSpeech({
142+
model: 'espnet/kan-bayashi_ljspeech_vits',
143+
inputs: 'hello human'
144+
})
145+
141146
// Computer Vision
142147

143148
await hf.imageClassification({
@@ -234,6 +239,7 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
234239

235240
- [x] Automatic speech recognition
236241
- [x] Audio classification
242+
- [x] Text-to-Speech
237243

238244
### Computer Vision
239245

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import { InferenceOutputError } from "../../lib/InferenceOutputError";
2+
import type { BaseArgs, Options } from "../../types";
3+
import { request } from "../custom/request";
4+
5+
export type TextToSpeechArgs = BaseArgs & {
6+
/**
7+
* The text to generate an audio from
8+
*/
9+
inputs: string;
10+
};
11+
12+
export type TextToSpeechOutput = Blob;
13+
14+
/**
15+
* This task synthesize an audio of a voice pronouncing a given text.
16+
* Recommended model: espnet/kan-bayashi_ljspeech_vits
17+
*/
18+
export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<TextToSpeechOutput> {
19+
const res = await request<TextToSpeechOutput>(args, options);
20+
const isValidOutput = res && res instanceof Blob;
21+
if (!isValidOutput) {
22+
throw new InferenceOutputError("Expected Blob");
23+
}
24+
return res;
25+
}

packages/inference/src/tasks/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export * from "./custom/streamingRequest";
55
// Audio tasks
66
export * from "./audio/audioClassification";
77
export * from "./audio/automaticSpeechRecognition";
8+
export * from "./audio/textToSpeech";
89

910
// Computer Vision tasks
1011
export * from "./cv/imageClassification";

packages/inference/test/HfInference.spec.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,14 @@ describe.concurrent(
357357
])
358358
);
359359
});
360+
it("textToSpeech", async () => {
361+
expect(
362+
await hf.textToSpeech({
363+
model: "espnet/kan-bayashi_ljspeech_vits",
364+
inputs: "hello there!",
365+
})
366+
).toBeInstanceOf(Blob);
367+
});
360368
it("imageClassification", async () => {
361369
expect(
362370
await hf.imageClassification({

packages/inference/test/tapes.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,5 +604,29 @@
604604
"vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers"
605605
}
606606
}
607+
},
608+
"fedb289665ec1890b722566764b838adcb7516d1bd5b18c0d777b8ed21e16034": {
609+
"url": "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits",
610+
"init": {
611+
"headers": {
612+
"Content-Type": "application/json"
613+
},
614+
"method": "POST",
615+
"body": "{\"inputs\":\"hello there!\",\"options\":{}}"
616+
},
617+
"response": {
618+
"body": "",
619+
"status": 200,
620+
"statusText": "OK",
621+
"headers": {
622+
"access-control-allow-credentials": "true",
623+
"access-control-expose-headers": "x-compute-type, x-compute-time",
624+
"connection": "keep-alive",
625+
"content-type": "audio/flac",
626+
"server": "uvicorn",
627+
"transfer-encoding": "chunked",
628+
"vary": "Accept-Encoding, Origin, Access-Control-Request-Method, Access-Control-Request-Headers"
629+
}
630+
}
607631
}
608632
}

0 commit comments

Comments
 (0)
0