Commit 699140e

committed: wip

1 parent b193fb1 commit 699140e

File tree: 10 files changed, +149 -108 lines changed

10 files changed

+149
-108
lines changed

packages/inference/src/tasks/audio/textToSpeech.ts

Lines changed: 1 addition & 4 deletions

@@ -1,13 +1,10 @@
-import type { TextToSpeechInput } from "@huggingface/tasks";
+import type { TextToSpeechInput, TextToSpeechOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
 type TextToSpeechArgs = BaseArgs & TextToSpeechInput;
 
-interface TextToSpeechOutput {
-	audio: Blob;
-}
 /**
  * This task synthesize an audio of a voice pronouncing a given text.
  * Recommended model: espnet/kan-bayashi_ljspeech_vits
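
With the local interface removed, callers get the shared TextToSpeechOutput type from @huggingface/tasks instead. A minimal usage sketch under that assumption (the token is a placeholder):

import { HfInference } from "@huggingface/inference";
import type { TextToSpeechOutput } from "@huggingface/tasks";

const hf = new HfInference("hf_xxx"); // placeholder token
const out: TextToSpeechOutput = await hf.textToSpeech({
	model: "espnet/kan-bayashi_ljspeech_vits",
	inputs: "hello there!",
});
// `audio` is a Blob on the shared type, the same shape the old local interface promised
console.log(out.audio.type, out.audio.size);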

packages/inference/src/tasks/cv/textToImage.ts

Lines changed: 0 additions & 1 deletion

@@ -31,7 +31,6 @@ export async function textToImage(args: TextToImageArgs, options?: Options): Pro
 		...options,
 		taskHint: "text-to-image",
 	});
-	console.log(res);
 	if (res && typeof res === "object") {
 		if (args.provider === "fal-ai" && "images" in res && Array.isArray(res.images) && res.images[0].url) {
 			const image = await fetch(res.images[0].url);

packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts

Lines changed: 23 additions & 9 deletions

@@ -4,9 +4,15 @@ import { request } from "../custom/request";
 import type { RequestArgs } from "../../types";
 import { toArray } from "../../utils/toArray";
 import { base64FromBytes } from "../../utils/base64FromBytes";
-import type { DocumentQuestionAnsweringInput, DocumentQuestionAnsweringOutput } from "@huggingface/tasks";
+import type {
+	DocumentQuestionAnsweringInput,
+	DocumentQuestionAnsweringInputData,
+	DocumentQuestionAnsweringOutput,
+} from "@huggingface/tasks";
 
-export type DocumentQuestionAnsweringArgs = BaseArgs & DocumentQuestionAnsweringInput;
+/// Override the type to properly set inputs.image as Blob
+export type DocumentQuestionAnsweringArgs = BaseArgs &
+	DocumentQuestionAnsweringInput & { inputs: DocumentQuestionAnsweringInputData & { image: Blob } };
 
 /**
  * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
@@ -20,22 +26,30 @@ export async function documentQuestionAnswering(
 		inputs: {
 			question: args.inputs.question,
 			// convert Blob or ArrayBuffer to base64
-			image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())),
+			image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
 		},
 	} as RequestArgs;
 	const res = toArray(
-		await request<[DocumentQuestionAnsweringOutput] | DocumentQuestionAnsweringOutput>(reqArgs, {
+		await request<DocumentQuestionAnsweringOutput | DocumentQuestionAnsweringOutput[number]>(reqArgs, {
 			...options,
 			taskHint: "document-question-answering",
 		})
-	)?.[0];
+	);
+
 	const isValidOutput =
-		typeof res?.answer === "string" &&
-		(typeof res.end === "number" || typeof res.end === "undefined") &&
-		(typeof res.score === "number" || typeof res.score === "undefined") &&
-		(typeof res.start === "number" || typeof res.start === "undefined");
+		Array.isArray(res) &&
+		res.every(
+			(elem) =>
+				typeof elem === "object" &&
+				!!elem &&
+				typeof elem?.answer === "string" &&
+				(typeof elem.end === "number" || typeof elem.end === "undefined") &&
+				(typeof elem.score === "number" || typeof elem.score === "undefined") &&
+				(typeof elem.start === "number" || typeof elem.start === "undefined")
+		);
 	if (!isValidOutput) {
 		throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>");
 	}
+
 	return res;
 }
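
The task now requires inputs.image to be a Blob and resolves to the full answer array instead of unwrapping the first element. A sketch of the new call shape, mirroring the updated test further down (token and image bytes are placeholders):

import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token
const imageBytes = new Uint8Array([/* png bytes */]); // placeholder image data

const answers = await hf.documentQuestionAnswering({
	model: "impira/layoutlm-document-qa",
	inputs: {
		question: "Invoice number?",
		image: new Blob([imageBytes], { type: "image/png" }),
	},
});
// One entry per candidate answer: { answer: string, end?: number, score?: number, start?: number }
for (const { answer, score } of answers) {
	console.log(answer, score);
}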

packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts

Lines changed: 18 additions & 12 deletions

@@ -1,10 +1,16 @@
-import type { VisualQuestionAnsweringInput, VisualQuestionAnsweringOutput } from "@huggingface/tasks";
+import type {
+	VisualQuestionAnsweringInput,
+	VisualQuestionAnsweringInputData,
+	VisualQuestionAnsweringOutput,
+} from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options, RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
 
-export type VisualQuestionAnsweringArgs = BaseArgs & VisualQuestionAnsweringInput;
+/// Override the type to properly set inputs.image as Blob
+export type VisualQuestionAnsweringArgs = BaseArgs &
+	VisualQuestionAnsweringInput & { inputs: VisualQuestionAnsweringInputData & { image: Blob } };
 
 /**
  * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa.
@@ -18,18 +24,18 @@ export async function visualQuestionAnswering(
 		inputs: {
 			question: args.inputs.question,
 			// convert Blob or ArrayBuffer to base64
-			image: base64FromBytes(
-				new Uint8Array(args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.arrayBuffer())
-			),
+			image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
 		},
 	} as RequestArgs;
-	const res = (
-		await request<[VisualQuestionAnsweringOutput]>(reqArgs, {
-			...options,
-			taskHint: "visual-question-answering",
-		})
-	)?.[0];
-	const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+	const res = await request<VisualQuestionAnsweringOutput>(reqArgs, {
+		...options,
+		taskHint: "visual-question-answering",
+	});
+	const isValidOutput =
+		Array.isArray(res) &&
+		res.every(
+			(elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && typeof elem.score === "number"
+		);
 	if (!isValidOutput) {
 		throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
 	}
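
Same pattern as document QA: the image must be a Blob, and the validated array is returned whole. A sketch based on the test fixture (token and image bytes are placeholders):

import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token
const res = await hf.visualQuestionAnswering({
	model: "dandelin/vilt-b32-finetuned-vqa",
	inputs: {
		question: "How many cats are lying down?",
		image: new Blob([new Uint8Array([/* png bytes */])], { type: "image/png" }),
	},
});
// res is Array<{ answer: string, score: number }>
for (const { answer, score } of res) {
	console.log(answer, score);
}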

packages/inference/src/tasks/nlp/questionAnswering.ts

Lines changed: 9 additions & 9 deletions

@@ -16,22 +16,22 @@ export async function questionAnswering(
 		...options,
 		taskHint: "question-answering",
 	});
-	const isValidOutput =
-		Array.isArray(res) ?
-			res.every(
+	const isValidOutput = Array.isArray(res)
+		? res.every(
 				(elem) =>
 					typeof elem === "object" &&
 					!!elem &&
 					typeof elem.answer === "string" &&
 					typeof elem.end === "number" &&
 					typeof elem.score === "number" &&
 					typeof elem.start === "number"
-			) : (typeof res === "object" &&
-			!!res &&
-			typeof res.answer === "string" &&
-			typeof res.end === "number" &&
-			typeof res.score === "number" &&
-			typeof res.start === "number");
+		  )
+		: typeof res === "object" &&
+		  !!res &&
+		  typeof res.answer === "string" &&
+		  typeof res.end === "number" &&
+		  typeof res.score === "number" &&
+		  typeof res.start === "number";
 	if (!isValidOutput) {
 		throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
 	}
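
Because the validation accepts either a bare answer object or an array of them, call sites that want a uniform shape can normalize the result themselves. A hedged sketch (the model name is illustrative, not part of this diff):

import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token
const res = await hf.questionAnswering({
	model: "deepset/roberta-base-squad2", // illustrative model
	inputs: {
		question: "What is the capital of France?",
		context: "The capital of France is Paris.",
	},
});
// Normalize the single-or-array union that the check above allows for
const answers = Array.isArray(res) ? res : [res];
console.log(answers[0].answer); // "Paris" in the test fixture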

packages/inference/src/tasks/nlp/tableQuestionAnswering.ts

Lines changed: 22 additions & 12 deletions

@@ -12,24 +12,34 @@ export async function tableQuestionAnswering(
 	args: TableQuestionAnsweringArgs,
 	options?: Options
 ): Promise<TableQuestionAnsweringOutput> {
-	const res = await request<TableQuestionAnsweringOutput>(args, {
+	const res = await request<TableQuestionAnsweringOutput | TableQuestionAnsweringOutput[number]>(args, {
 		...options,
 		taskHint: "table-question-answering",
 	});
-	const isValidOutput =
-		Array.isArray(res) &&
-		res.every((elem) => {
-			typeof elem?.aggregator === "string" &&
-				typeof elem.answer === "string" &&
-				Array.isArray(elem.cells) &&
-				elem.cells.every((x) => typeof x === "string") &&
-				Array.isArray(elem.coordinates) &&
-				elem.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number"));
-		});
+	const isValidOutput = Array.isArray(res) ? res.every((elem) => validate(elem)) : validate(res);
 	if (!isValidOutput) {
 		throw new InferenceOutputError(
 			"Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}"
 		);
 	}
-	return res;
+	return Array.isArray(res) ? res : [res];
+}
+
+function validate(elem: unknown): elem is TableQuestionAnsweringOutput[number] {
+	return (
+		typeof elem === "object" &&
+		!!elem &&
+		"aggregator" in elem &&
+		typeof elem.aggregator === "string" &&
+		"answer" in elem &&
+		typeof elem.answer === "string" &&
+		"cells" in elem &&
+		Array.isArray(elem.cells) &&
+		elem.cells.every((x: unknown): x is string => typeof x === "string") &&
+		"coordinates" in elem &&
+		Array.isArray(elem.coordinates) &&
+		elem.coordinates.every(
+			(coord: unknown): coord is number[] => Array.isArray(coord) && coord.every((x) => typeof x === "number")
+		)
+	);
 }
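
With validate acting as a type guard, a bare-object response is wrapped so the function always resolves to an array. A usage sketch (token, model name, and query are illustrative; the table data echoes the expected test output below):

import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_xxx"); // placeholder token
const out = await hf.tableQuestionAnswering({
	model: "google/tapas-base-finetuned-wtq", // illustrative model
	inputs: {
		query: "What is the average number of stars?", // illustrative query
		table: {
			Repository: ["Transformers", "Datasets"],
			Stars: ["36542", "4512"],
		},
	},
});
// Always an array after this change, even when the API returned a single object
console.log(out[0].answer); // e.g. "AVERAGE > 36542"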

packages/inference/test/HfInference.spec.ts

Lines changed: 48 additions & 42 deletions

@@ -1,6 +1,6 @@
 import { expect, it, describe, assert } from "vitest";
 
-import type { ChatCompletionStreamOutput } from "@huggingface/tasks";
+import type { ChatCompletionStreamOutput, VisualQuestionAnsweringInput } from "@huggingface/tasks";
 
 import { chatCompletion, HfInference } from "../src";
 import "./vcr";
@@ -87,13 +87,14 @@ describe.concurrent("HfInference", () => {
 				context: "The capital of France is Paris.",
 			},
 		});
-
-		expect(res).toMatchObject([{
-			answer: "Paris",
-			score: expect.any(Number),
-			start: expect.any(Number),
-			end: expect.any(Number),
-		}]);
+		expect(res).toMatchObject([
+			{
+				answer: "Paris",
+				score: expect.any(Number),
+				start: expect.any(Number),
+				end: expect.any(Number),
+			},
+		]);
 	});
 
 	it("tableQuestionAnswering", async () => {
@@ -110,30 +111,31 @@ describe.concurrent("HfInference", () => {
 					},
 				},
 			})
-		).toMatchObject({
-			answer: "AVERAGE > 36542",
-			coordinates: [[0, 1]],
-			cells: ["36542"],
-			aggregator: "AVERAGE",
-		});
+		).toMatchObject([
+			{
+				answer: "AVERAGE > 36542",
+				coordinates: [[0, 1]],
+				cells: ["36542"],
+				aggregator: "AVERAGE",
+			},
+		]);
 	});
 
 	it("documentQuestionAnswering", async () => {
-		expect(
-			await hf.documentQuestionAnswering({
-				model: "impira/layoutlm-document-qa",
-				inputs: {
-					question: "Invoice number?",
-					image: new Blob([readTestFile("invoice.png")], { type: "image/png" }),
-				},
-			})
-		).toMatchObject({
-			answer: "us-001",
-			score: expect.any(Number),
-			// not sure what start/end refers to in this case
-			start: expect.any(Number),
-			end: expect.any(Number),
+		const res = await hf.documentQuestionAnswering({
+			model: "impira/layoutlm-document-qa",
+			inputs: {
+				question: "Invoice number?",
+				image: new Blob([readTestFile("invoice.png")], { type: "image/png" }),
+			},
 		});
+		expect(res).toBeInstanceOf(Array);
+		for (const elem of res) {
+			expect(elem).toMatchObject({
+				answer: expect.any(String),
+				score: expect.any(Number),
+			});
+		}
 	});
 
 	// Errors with "Error: If you are using a VisionEncoderDecoderModel, you must provide a feature extractor"
@@ -152,18 +154,20 @@ describe.concurrent("HfInference", () => {
 	});
 
 	it("visualQuestionAnswering", async () => {
-		expect(
-			await hf.visualQuestionAnswering({
-				model: "dandelin/vilt-b32-finetuned-vqa",
-				inputs: {
-					question: "How many cats are lying down?",
-					image: new Blob([readTestFile("cats.png")], { type: "image/png" }),
-				},
-			})
-		).toMatchObject({
-			answer: "2",
-			score: expect.any(Number),
-		});
+		const res = await hf.visualQuestionAnswering({
+			model: "dandelin/vilt-b32-finetuned-vqa",
+			inputs: {
+				question: "How many cats are lying down?",
+				image: new Blob([readTestFile("cats.png")], { type: "image/png" }),
+			},
+		} satisfies VisualQuestionAnsweringInput);
+		expect(res).toBeInstanceOf(Array);
+		for (const elem of res) {
+			expect(elem).toMatchObject({
+				answer: expect.any(String),
+				score: expect.any(Number),
+			});
+		}
 	});
 
 	it("textClassification", async () => {
@@ -451,7 +455,9 @@ describe.concurrent("HfInference", () => {
 				model: "espnet/kan-bayashi_ljspeech_vits",
 				inputs: "hello there!",
 			})
-		).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob);
+		).toMatchObject({
+			audio: expect.any(Blob),
+		});
 	});
 
 	it("imageClassification", async () => {
@@ -473,7 +479,7 @@ describe.concurrent("HfInference", () => {
 	it("zeroShotImageClassification", async () => {
 		expect(
 			await hf.zeroShotImageClassification({
-				inputs: { image: new Blob([readTestFile("cheetah.png")], { type: "image/png" }) },
+				inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }),
 				model: "openai/clip-vit-large-patch14-336",
 				parameters: {
 					candidate_labels: ["animal", "toy", "car"],

packages/tasks/src/tasks/text-to-speech/inference.ts

Lines changed: 14 additions & 16 deletions

@@ -1,3 +1,17 @@
+/**
+ * Outputs of inference for the Text To Speech task
+ */
+export interface TextToSpeechOutput {
+	/**
+	 * The generated audio
+	 */
+	audio: Blob;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	sampling_rate?: number;
+	[property: string]: unknown;
+}
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
@@ -117,19 +131,3 @@ export interface GenerationParameters {
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-/**
- * Outputs for Text to Speech inference
- *
- * Outputs of inference for the Text To Audio task
- */
-export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	sampling_rate: number;
-	[property: string]: unknown;
-}
packages/tasks/src/tasks/text-to-speech/spec/output.json

Lines changed: 13 additions & 2 deletions

@@ -1,7 +1,18 @@
 {
-	"$ref": "/inference/schemas/text-to-audio/output.json",
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Speech task",
 	"title": "TextToSpeechOutput",
-	"description": "Outputs for Text to Speech inference"
+	"type": "object",
+	"properties": {
+		"audio": {
+			"description": "The generated audio",
+			"comment": "type=binary"
+		},
+		"sampling_rate": {
+			"type": "number",
+			"description": "The sampling rate of the generated audio waveform."
+		}
+	},
+	"required": ["audio"]
 }
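
Together with the regenerated interface in inference.ts above, the schema now declares its own object shape instead of $ref-ing the text-to-audio output. An illustrative value that satisfies the resulting type:

import type { TextToSpeechOutput } from "@huggingface/tasks";

// Illustrative only: real audio bytes come from the inference response
const sample: TextToSpeechOutput = {
	audio: new Blob([new Uint8Array([/* wav bytes */])], { type: "audio/wav" }),
	sampling_rate: 22050, // optional after this change
};
console.log(sample.audio.type);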
