From 5bf54937022247f7efb937013bcab84834d4ede3 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 13:23:19 +0100 Subject: [PATCH 01/23] Inference: use Blob for audio & image parameters --- packages/agents/pnpm-lock.yaml | 13 ++- .../tasks-gen/scripts/inference-codegen.ts | 92 ++++++++++++++++++- .../tasks/audio-classification/inference.ts | 2 +- .../audio-classification/spec/input.json | 3 +- .../automatic-speech-recognition/inference.ts | 8 +- .../spec/input.json | 3 +- .../src/tasks/chat-completion/inference.ts | 33 ------- .../src/tasks/depth-estimation/inference.ts | 6 +- .../spec/input.json | 3 +- .../src/tasks/feature-extraction/inference.ts | 3 - .../tasks/image-classification/inference.ts | 2 +- .../image-classification/spec/input.json | 3 +- .../src/tasks/image-segmentation/inference.ts | 2 +- .../tasks/image-segmentation/spec/input.json | 3 +- .../src/tasks/image-to-image/inference.ts | 6 +- .../src/tasks/image-to-image/spec/input.json | 3 +- .../src/tasks/image-to-text/inference.ts | 7 +- .../src/tasks/image-to-text/spec/input.json | 3 +- .../src/tasks/object-detection/inference.ts | 2 +- .../tasks/object-detection/spec/input.json | 3 +- .../tasks/sentence-similarity/inference.ts | 7 +- .../src/tasks/summarization/inference.ts | 8 +- .../src/tasks/text-generation/inference.ts | 13 --- .../src/tasks/text-to-audio/inference.ts | 35 +++---- .../src/tasks/text-to-audio/spec/output.json | 3 +- .../src/tasks/text-to-image/inference.ts | 4 - .../src/tasks/text-to-speech/inference.ts | 5 - .../tasks/text2text-generation/inference.ts | 8 +- .../tasks/src/tasks/translation/inference.ts | 8 +- .../visual-question-answering/spec/input.json | 3 +- .../inference.ts | 2 +- .../spec/input.json | 3 +- .../zero-shot-object-detection/inference.ts | 2 +- .../spec/input.json | 3 +- 34 files changed, 165 insertions(+), 139 deletions(-) diff --git a/packages/agents/pnpm-lock.yaml b/packages/agents/pnpm-lock.yaml index 060aacb353..455c7460ab 100644 --- a/packages/agents/pnpm-lock.yaml +++ b/packages/agents/pnpm-lock.yaml @@ -7,7 +7,7 @@ settings: dependencies: '@huggingface/inference': specifier: ^2.6.1 - version: link:../inference + version: 2.8.1 devDependencies: '@types/node': @@ -16,6 +16,17 @@ devDependencies: packages: + /@huggingface/inference@2.8.1: + resolution: {integrity: sha512-EfsNtY9OR6JCNaUa5bZu2mrs48iqeTz0Gutwf+fU0Kypx33xFQB4DKMhp8u4Ee6qVbLbNWvTHuWwlppLQl4p4Q==} + engines: {node: '>=18'} + dependencies: + '@huggingface/tasks': 0.12.30 + dev: false + + /@huggingface/tasks@0.12.30: + resolution: {integrity: sha512-A1ITdxbEzx9L8wKR8pF7swyrTLxWNDFIGDLUWInxvks2ruQ8PLRBZe8r0EcjC3CDdtlj9jV1V4cgV35K/iy3GQ==} + dev: false + /@types/node@18.13.0: resolution: {integrity: sha512-gC3TazRzGoOnoKAhUx+Q0t8S9Tzs74z7m0ipwGpSqQrleP14hKxP4/JUeEQcD3W1/aIpnWl8pHowI7WokuZpXg==} dev: true diff --git a/packages/tasks-gen/scripts/inference-codegen.ts b/packages/tasks-gen/scripts/inference-codegen.ts index 2b53751191..faf55f81c3 100644 --- a/packages/tasks-gen/scripts/inference-codegen.ts +++ b/packages/tasks-gen/scripts/inference-codegen.ts @@ -94,6 +94,93 @@ async function generatePython(inputData: InputData): Promise { + await generateTopLevelArrays(path2generated, outputSpec); + await generateBinaryInputTypes(path2generated, inputSpec, outputSpec); +} + +async function generateBinaryInputTypes( + path2generated: string, + inputSpec: JSONSchemaSpec, + outputSpec: JSONSchemaSpec +): Promise { + const tsSource = ts.createSourceFile( + path.basename(path2generated), + await fs.readFile(path2generated, 
{ encoding: "utf-8" }), + ts.ScriptTarget.ES2022 + ); + + const inputRootName = inputSpec.title; + const outputRootName = outputSpec.title; + if (typeof inputRootName !== "string" || typeof outputRootName !== "string") { + return; + } + const topLevelNodes = tsSource.getChildAt(0).getChildren(); + + let newNodes = [...topLevelNodes]; + + for (const interfaceNode of topLevelNodes.filter( + (node): node is ts.InterfaceDeclaration => node.kind === ts.SyntaxKind.InterfaceDeclaration + )) { + if (interfaceNode.name.escapedText !== inputRootName && interfaceNode.name.escapedText !== outputRootName) { + continue; + } + + const spec = interfaceNode.name.escapedText === inputRootName ? inputSpec : outputSpec; + + interfaceNode.forEachChild((child) => { + if (child.kind !== ts.SyntaxKind.PropertySignature) { + return; + } + const propSignature = child as ts.PropertySignature; + if (!propSignature.type) { + return; + } + const propName = propSignature.name.getText(tsSource); + const propIsMedia = !!spec["properties"]?.[propName]?.["comment"]?.includes("type=binary"); + if (!propIsMedia) { + return; + } + const updatedType = ts.factory.createTypeReferenceNode("Blob"); + const updated = ts.factory.updatePropertySignature( + propSignature, + propSignature.modifiers, + propSignature.name, + propSignature.questionToken, + updatedType + ); + const updatedInterface = ts.factory.updateInterfaceDeclaration( + interfaceNode, + interfaceNode.modifiers, + interfaceNode.name, + interfaceNode.typeParameters, + interfaceNode.heritageClauses, + [updated, ...interfaceNode.members.filter((member) => member.name?.getText(tsSource) !== propName)] + ); + newNodes = [updatedInterface, ...newNodes.filter((node) => node !== interfaceNode)]; + }); + } + const printer = ts.createPrinter(); + console.log(printer.printList(ts.ListFormat.MultiLine, ts.factory.createNodeArray(newNodes), tsSource)); + + await fs.writeFile( + path2generated, + printer.printList(ts.ListFormat.MultiLine, ts.factory.createNodeArray(newNodes), tsSource), + { + flag: "w+", + encoding: "utf-8", + } + ); +} + /** * quicktype is unable to generate "top-level array types" that are defined in the output spec: https://github.com/glideapps/quicktype/issues/2481 * We have to use the TypeScript API to generate those types when required. @@ -105,7 +192,7 @@ async function generatePython(inputData: InputData): Promise): Promise { +async function generateTopLevelArrays(path2generated: string, outputSpec: Record): Promise { const source = ts.createSourceFile( path.basename(path2generated), await fs.readFile(path2generated, { encoding: "utf-8" }), @@ -208,9 +295,10 @@ for (const { task, dirPath } of allTasks) { } const outputSpec = JSON.parse(await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" })); + const inputSpec = JSON.parse(await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" })); console.log(" 🩹 Post-processing the generated code"); - await postProcessOutput(`${dirPath}/inference.ts`, outputSpec); + await postProcessOutput(`${dirPath}/inference.ts`, outputSpec, inputSpec); console.debug(" 🏭 Generating Python code"); { diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts index 9d3bfb1daa..5a87b2e46c 100644 --- a/packages/tasks/src/tasks/audio-classification/inference.ts +++ b/packages/tasks/src/tasks/audio-classification/inference.ts @@ -11,7 +11,7 @@ export interface AudioClassificationInput { * The input audio data as a base64-encoded string. 
If no `parameters` are provided, you can * also provide the audio data as a raw bytes payload. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Audio Classification */ diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json index dc8036b9b9..df86e333b4 100644 --- a/packages/tasks/src/tasks/audio-classification/spec/input.json +++ b/packages/tasks/src/tasks/audio-classification/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.", - "type": "string" + "type": "string", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Audio Classification", diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts index d105c16e86..7070dbdebc 100644 --- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts +++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Automatic Speech Recognition inference */ @@ -12,14 +11,13 @@ export interface AutomaticSpeechRecognitionInput { * The input audio data as a base64-encoded string. If no `parameters` are provided, you can * also provide the audio data as a raw bytes payload. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Automatic Speech Recognition */ parameters?: AutomaticSpeechRecognitionParameters; [property: string]: unknown; } - /** * Additional inference parameters for Automatic Speech Recognition */ @@ -34,7 +32,6 @@ export interface AutomaticSpeechRecognitionParameters { return_timestamps?: boolean; [property: string]: unknown; } - /** * Parametrization of the text generation process */ @@ -121,12 +118,10 @@ export interface GenerationParameters { use_cache?: boolean; [property: string]: unknown; } - /** * Controls the stopping condition for beam-based methods. */ export type EarlyStoppingUnion = boolean | "never"; - /** * Outputs of inference for the Automatic Speech Recognition task */ @@ -142,7 +137,6 @@ export interface AutomaticSpeechRecognitionOutput { text: string; [property: string]: unknown; } - export interface AutomaticSpeechRecognitionOutputChunk { /** * A chunk of text identified by the model diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json index 98f1bdf5b2..b1e050e75e 100644 --- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json +++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "description": "The input audio data as a base64-encoded string. 
If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.", - "type": "string" + "type": "string", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Automatic Speech Recognition", diff --git a/packages/tasks/src/tasks/chat-completion/inference.ts b/packages/tasks/src/tasks/chat-completion/inference.ts index 4c4449a733..c5ae2ae29e 100644 --- a/packages/tasks/src/tasks/chat-completion/inference.ts +++ b/packages/tasks/src/tasks/chat-completion/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Chat Completion Input. * @@ -105,30 +104,24 @@ export interface ChatCompletionInput { top_p?: number; [property: string]: unknown; } - export interface ChatCompletionInputMessage { content: ChatCompletionInputMessageContent; name?: string; role: string; [property: string]: unknown; } - export type ChatCompletionInputMessageContent = ChatCompletionInputMessageChunk[] | string; - export interface ChatCompletionInputMessageChunk { image_url?: ChatCompletionInputURL; text?: string; type: ChatCompletionInputMessageChunkType; [property: string]: unknown; } - export interface ChatCompletionInputURL { url: string; [property: string]: unknown; } - export type ChatCompletionInputMessageChunkType = "text" | "image_url"; - export interface ChatCompletionInputGrammarType { type: ChatCompletionInputGrammarTypeType; /** @@ -140,9 +133,7 @@ export interface ChatCompletionInputGrammarType { value: unknown; [property: string]: unknown; } - export type ChatCompletionInputGrammarTypeType = "json" | "regex"; - export interface ChatCompletionInputStreamOptions { /** * If set, an additional chunk will be streamed before the data: [DONE] message. The usage @@ -153,13 +144,11 @@ export interface ChatCompletionInputStreamOptions { include_usage: boolean; [property: string]: unknown; } - /** * * */ export type ChatCompletionInputToolChoice = ChatCompletionInputToolChoiceEnum | ChatCompletionInputToolChoiceObject; - /** * Means the model can pick between generating a message or calling one or more tools. * @@ -168,30 +157,25 @@ export type ChatCompletionInputToolChoice = ChatCompletionInputToolChoiceEnum | * Means the model must call one or more tools. */ export type ChatCompletionInputToolChoiceEnum = "auto" | "none" | "required"; - export interface ChatCompletionInputToolChoiceObject { function: ChatCompletionInputFunctionName; [property: string]: unknown; } - export interface ChatCompletionInputFunctionName { name: string; [property: string]: unknown; } - export interface ChatCompletionInputTool { function: ChatCompletionInputFunctionDefinition; type: string; [property: string]: unknown; } - export interface ChatCompletionInputFunctionDefinition { arguments: unknown; description?: string; name: string; [property: string]: unknown; } - /** * Chat Completion Output. 
* @@ -208,7 +192,6 @@ export interface ChatCompletionOutput { usage: ChatCompletionOutputUsage; [property: string]: unknown; } - export interface ChatCompletionOutputComplete { finish_reason: string; index: number; @@ -216,53 +199,45 @@ export interface ChatCompletionOutputComplete { message: ChatCompletionOutputMessage; [property: string]: unknown; } - export interface ChatCompletionOutputLogprobs { content: ChatCompletionOutputLogprob[]; [property: string]: unknown; } - export interface ChatCompletionOutputLogprob { logprob: number; token: string; top_logprobs: ChatCompletionOutputTopLogprob[]; [property: string]: unknown; } - export interface ChatCompletionOutputTopLogprob { logprob: number; token: string; [property: string]: unknown; } - export interface ChatCompletionOutputMessage { content?: string; role: string; tool_calls?: ChatCompletionOutputToolCall[]; [property: string]: unknown; } - export interface ChatCompletionOutputToolCall { function: ChatCompletionOutputFunctionDefinition; id: string; type: string; [property: string]: unknown; } - export interface ChatCompletionOutputFunctionDefinition { arguments: unknown; description?: string; name: string; [property: string]: unknown; } - export interface ChatCompletionOutputUsage { completion_tokens: number; prompt_tokens: number; total_tokens: number; [property: string]: unknown; } - /** * Chat Completion Stream Output. * @@ -279,7 +254,6 @@ export interface ChatCompletionStreamOutput { usage?: ChatCompletionStreamOutputUsage; [property: string]: unknown; } - export interface ChatCompletionStreamOutputChoice { delta: ChatCompletionStreamOutputDelta; finish_reason?: string; @@ -287,14 +261,12 @@ export interface ChatCompletionStreamOutputChoice { logprobs?: ChatCompletionStreamOutputLogprobs; [property: string]: unknown; } - export interface ChatCompletionStreamOutputDelta { content?: string; role: string; tool_calls?: ChatCompletionStreamOutputDeltaToolCall; [property: string]: unknown; } - export interface ChatCompletionStreamOutputDeltaToolCall { function: ChatCompletionStreamOutputFunction; id: string; @@ -302,31 +274,26 @@ export interface ChatCompletionStreamOutputDeltaToolCall { type: string; [property: string]: unknown; } - export interface ChatCompletionStreamOutputFunction { arguments: string; name?: string; [property: string]: unknown; } - export interface ChatCompletionStreamOutputLogprobs { content: ChatCompletionStreamOutputLogprob[]; [property: string]: unknown; } - export interface ChatCompletionStreamOutputLogprob { logprob: number; token: string; top_logprobs: ChatCompletionStreamOutputTopLogprob[]; [property: string]: unknown; } - export interface ChatCompletionStreamOutputTopLogprob { logprob: number; token: string; [property: string]: unknown; } - export interface ChatCompletionStreamOutputUsage { completion_tokens: number; prompt_tokens: number; diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts index 6b2cff1ff7..0e81e8de28 100644 --- a/packages/tasks/src/tasks/depth-estimation/inference.ts +++ b/packages/tasks/src/tasks/depth-estimation/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Depth Estimation inference */ @@ -15,10 +14,11 @@ export interface DepthEstimationInput { /** * Additional inference parameters for Depth Estimation */ - parameters?: { [key: string]: unknown }; + parameters?: { + [key: string]: unknown; + }; [property: string]: unknown; } - /** * Outputs of inference for the Depth 
Estimation task */ diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json index e04e53436d..3a8035f71e 100644 --- a/packages/tasks/src/tasks/document-question-answering/spec/input.json +++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json @@ -11,7 +11,8 @@ "title": "DocumentQuestionAnsweringInputData", "properties": { "image": { - "description": "The image on which the question is asked" + "description": "The image on which the question is asked", + "comment": "type=binary" }, "question": { "type": "string", diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts index 404b103089..ad9b8103b1 100644 --- a/packages/tasks/src/tasks/feature-extraction/inference.ts +++ b/packages/tasks/src/tasks/feature-extraction/inference.ts @@ -3,9 +3,7 @@ * * Using src/scripts/inference-codegen */ - export type FeatureExtractionOutput = Array; - /** * Feature Extraction Input. * @@ -36,5 +34,4 @@ export interface FeatureExtractionInput { truncation_direction?: FeatureExtractionInputTruncationDirection; [property: string]: unknown; } - export type FeatureExtractionInputTruncationDirection = "Left" | "Right"; diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts index 1f6fd103e3..923fbfec0e 100644 --- a/packages/tasks/src/tasks/image-classification/inference.ts +++ b/packages/tasks/src/tasks/image-classification/inference.ts @@ -11,7 +11,7 @@ export interface ImageClassificationInput { * The input image data as a base64-encoded string. If no `parameters` are provided, you can * also provide the image data as a raw bytes payload. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Image Classification */ diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json index 3e2bd13d4d..4942226d29 100644 --- a/packages/tasks/src/tasks/image-classification/spec/input.json +++ b/packages/tasks/src/tasks/image-classification/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "type": "string", - "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload." + "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Image Classification", diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts index e299d0e67a..72db730745 100644 --- a/packages/tasks/src/tasks/image-segmentation/inference.ts +++ b/packages/tasks/src/tasks/image-segmentation/inference.ts @@ -11,7 +11,7 @@ export interface ImageSegmentationInput { * The input image data as a base64-encoded string. If no `parameters` are provided, you can * also provide the image data as a raw bytes payload. 
*/ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Image Segmentation */ diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json index 2023f93f1c..afc40ad2db 100644 --- a/packages/tasks/src/tasks/image-segmentation/spec/input.json +++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "type": "string", - "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload." + "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Image Segmentation", diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts index 658bbd0122..9b5d776903 100644 --- a/packages/tasks/src/tasks/image-to-image/inference.ts +++ b/packages/tasks/src/tasks/image-to-image/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Image To Image inference */ @@ -12,14 +11,13 @@ export interface ImageToImageInput { * The input image data as a base64-encoded string. If no `parameters` are provided, you can * also provide the image data as a raw bytes payload. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Image To Image */ parameters?: ImageToImageParameters; [property: string]: unknown; } - /** * Additional inference parameters for Image To Image */ @@ -44,7 +42,6 @@ export interface ImageToImageParameters { target_size?: TargetSize; [property: string]: unknown; } - /** * The size in pixel of the output image. */ @@ -53,7 +50,6 @@ export interface TargetSize { width: number; [property: string]: unknown; } - /** * Outputs of inference for the Image To Image task */ diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json index 5020d84cf1..431b815ef4 100644 --- a/packages/tasks/src/tasks/image-to-image/spec/input.json +++ b/packages/tasks/src/tasks/image-to-image/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "type": "string", - "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload." + "description": "The input image data as a base64-encoded string. 
If no `parameters` are provided, you can also provide the image data as a raw bytes payload.", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Image To Image", diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts index fd81b4e628..5c6bbe30c1 100644 --- a/packages/tasks/src/tasks/image-to-text/inference.ts +++ b/packages/tasks/src/tasks/image-to-text/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Image To Text inference */ @@ -11,14 +10,13 @@ export interface ImageToTextInput { /** * The input image data */ - inputs: unknown; + inputs: Blob; /** * Additional inference parameters for Image To Text */ parameters?: ImageToTextParameters; [property: string]: unknown; } - /** * Additional inference parameters for Image To Text */ @@ -33,7 +31,6 @@ export interface ImageToTextParameters { max_new_tokens?: number; [property: string]: unknown; } - /** * Parametrization of the text generation process */ @@ -120,12 +117,10 @@ export interface GenerationParameters { use_cache?: boolean; [property: string]: unknown; } - /** * Controls the stopping condition for beam-based methods. */ export type EarlyStoppingUnion = boolean | "never"; - /** * Outputs of inference for the Image To Text task */ diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json index 7b3fd27562..26915b34a7 100644 --- a/packages/tasks/src/tasks/image-to-text/spec/input.json +++ b/packages/tasks/src/tasks/image-to-text/spec/input.json @@ -6,7 +6,8 @@ "type": "object", "properties": { "inputs": { - "description": "The input image data" + "description": "The input image data", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Image To Text", diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts index 4bb02bd012..58f330e681 100644 --- a/packages/tasks/src/tasks/object-detection/inference.ts +++ b/packages/tasks/src/tasks/object-detection/inference.ts @@ -11,7 +11,7 @@ export interface ObjectDetectionInput { * The input image data as a base64-encoded string. If no `parameters` are provided, you can * also provide the image data as a raw bytes payload. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Object Detection */ diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json index 948392bf06..55df78f56b 100644 --- a/packages/tasks/src/tasks/object-detection/spec/input.json +++ b/packages/tasks/src/tasks/object-detection/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "type": "string", - "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload." + "description": "The input image data as a base64-encoded string. 
If no `parameters` are provided, you can also provide the image data as a raw bytes payload.", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Object Detection", diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts index 277aa4a83f..1d247d128f 100644 --- a/packages/tasks/src/tasks/sentence-similarity/inference.ts +++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts @@ -3,9 +3,7 @@ * * Using src/scripts/inference-codegen */ - export type SentenceSimilarityOutput = number[]; - /** * Inputs for Sentence similarity inference */ @@ -14,10 +12,11 @@ export interface SentenceSimilarityInput { /** * Additional inference parameters for Sentence Similarity */ - parameters?: { [key: string]: unknown }; + parameters?: { + [key: string]: unknown; + }; [property: string]: unknown; } - export interface SentenceSimilarityInputData { /** * A list of strings which will be compared against the source_sentence. diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts index ecec9e97ed..ed28c5632f 100644 --- a/packages/tasks/src/tasks/summarization/inference.ts +++ b/packages/tasks/src/tasks/summarization/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Summarization inference */ @@ -18,7 +17,6 @@ export interface SummarizationInput { parameters?: SummarizationParameters; [property: string]: unknown; } - /** * Additional inference parameters for summarization. */ @@ -30,19 +28,19 @@ export interface SummarizationParameters { /** * Additional parametrization of the text generation algorithm. */ - generate_parameters?: { [key: string]: unknown }; + generate_parameters?: { + [key: string]: unknown; + }; /** * The truncation strategy to use. */ truncation?: SummarizationTruncationStrategy; [property: string]: unknown; } - /** * The truncation strategy to use. */ export type SummarizationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second"; - /** * Outputs of inference for the Summarization task */ diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts index 67de53afa1..328d7d73ff 100644 --- a/packages/tasks/src/tasks/text-generation/inference.ts +++ b/packages/tasks/src/tasks/text-generation/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Text Generation Input. * @@ -17,7 +16,6 @@ export interface TextGenerationInput { stream?: boolean; [property: string]: unknown; } - export interface TextGenerationInputGenerateParameters { /** * Lora adapter id @@ -100,7 +98,6 @@ export interface TextGenerationInputGenerateParameters { watermark?: boolean; [property: string]: unknown; } - export interface TextGenerationInputGrammarType { type: Type; /** @@ -112,9 +109,7 @@ export interface TextGenerationInputGrammarType { value: unknown; [property: string]: unknown; } - export type Type = "json" | "regex"; - /** * Text Generation Output. 
* @@ -127,7 +122,6 @@ export interface TextGenerationOutput { generated_text: string; [property: string]: unknown; } - export interface TextGenerationOutputDetails { best_of_sequences?: TextGenerationOutputBestOfSequence[]; finish_reason: TextGenerationOutputFinishReason; @@ -138,7 +132,6 @@ export interface TextGenerationOutputDetails { top_tokens?: Array; [property: string]: unknown; } - export interface TextGenerationOutputBestOfSequence { finish_reason: TextGenerationOutputFinishReason; generated_text: string; @@ -149,16 +142,13 @@ export interface TextGenerationOutputBestOfSequence { top_tokens?: Array; [property: string]: unknown; } - export type TextGenerationOutputFinishReason = "length" | "eos_token" | "stop_sequence"; - export interface TextGenerationOutputPrefillToken { id: number; logprob: number; text: string; [property: string]: unknown; } - export interface TextGenerationOutputToken { id: number; logprob: number; @@ -166,7 +156,6 @@ export interface TextGenerationOutputToken { text: string; [property: string]: unknown; } - /** * Text Generation Stream Output. * @@ -182,7 +171,6 @@ export interface TextGenerationStreamOutput { top_tokens?: TextGenerationStreamOutputToken[]; [property: string]: unknown; } - export interface TextGenerationStreamOutputStreamDetails { finish_reason: TextGenerationOutputFinishReason; generated_tokens: number; @@ -190,7 +178,6 @@ export interface TextGenerationStreamOutputStreamDetails { seed?: number; [property: string]: unknown; } - export interface TextGenerationStreamOutputToken { id: number; logprob: number; diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts index f08aa87e1c..eed9c588b7 100644 --- a/packages/tasks/src/tasks/text-to-audio/inference.ts +++ b/packages/tasks/src/tasks/text-to-audio/inference.ts @@ -1,9 +1,23 @@ +/** + * Outputs of inference for the Text To Audio task + */ +export interface TextToAudioOutput { + /** + * The generated audio waveform. + */ + audio: Blob; + samplingRate: unknown; + /** + * The sampling rate of the generated audio waveform. + */ + sampling_rate?: number; + [property: string]: unknown; +} /** * Inference code generated from the JSON schema spec in ./spec * * Using src/scripts/inference-codegen */ - /** * Inputs for Text To Audio inference */ @@ -18,7 +32,6 @@ export interface TextToAudioInput { parameters?: TextToAudioParameters; [property: string]: unknown; } - /** * Additional inference parameters for Text To Audio */ @@ -29,7 +42,6 @@ export interface TextToAudioParameters { generation_parameters?: GenerationParameters; [property: string]: unknown; } - /** * Parametrization of the text generation process */ @@ -116,24 +128,7 @@ export interface GenerationParameters { use_cache?: boolean; [property: string]: unknown; } - /** * Controls the stopping condition for beam-based methods. */ export type EarlyStoppingUnion = boolean | "never"; - -/** - * Outputs of inference for the Text To Audio task - */ -export interface TextToAudioOutput { - /** - * The generated audio waveform. - */ - audio: unknown; - samplingRate: unknown; - /** - * The sampling rate of the generated audio waveform. 
- */ - sampling_rate?: number; - [property: string]: unknown; -} diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json index c171d62bff..9db5ddbf82 100644 --- a/packages/tasks/src/tasks/text-to-audio/spec/output.json +++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json @@ -6,7 +6,8 @@ "type": "object", "properties": { "audio": { - "description": "The generated audio waveform." + "description": "The generated audio waveform.", + "comment": "type=binary" }, "sampling_rate": { "type": "number", diff --git a/packages/tasks/src/tasks/text-to-image/inference.ts b/packages/tasks/src/tasks/text-to-image/inference.ts index ae8a3b6a3a..0770cf6a63 100644 --- a/packages/tasks/src/tasks/text-to-image/inference.ts +++ b/packages/tasks/src/tasks/text-to-image/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Text To Image inference */ @@ -18,7 +17,6 @@ export interface TextToImageInput { parameters?: TextToImageParameters; [property: string]: unknown; } - /** * Additional inference parameters for Text To Image */ @@ -51,7 +49,6 @@ export interface TextToImageParameters { target_size?: TargetSize; [property: string]: unknown; } - /** * The size in pixel of the output image */ @@ -60,7 +57,6 @@ export interface TargetSize { width: number; [property: string]: unknown; } - /** * Outputs of inference for the Text To Image task */ diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts index 230aad902b..26d2218b57 100644 --- a/packages/tasks/src/tasks/text-to-speech/inference.ts +++ b/packages/tasks/src/tasks/text-to-speech/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Text To Speech inference */ @@ -18,7 +17,6 @@ export interface TextToSpeechInput { parameters?: TextToSpeechParameters; [property: string]: unknown; } - /** * Additional inference parameters for Text To Speech */ @@ -29,7 +27,6 @@ export interface TextToSpeechParameters { generation_parameters?: GenerationParameters; [property: string]: unknown; } - /** * Parametrization of the text generation process */ @@ -116,12 +113,10 @@ export interface GenerationParameters { use_cache?: boolean; [property: string]: unknown; } - /** * Controls the stopping condition for beam-based methods. 
*/ export type EarlyStoppingUnion = boolean | "never"; - /** * Outputs for Text to Speech inference * diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts index 6bd9dab815..0aaf87474d 100644 --- a/packages/tasks/src/tasks/text2text-generation/inference.ts +++ b/packages/tasks/src/tasks/text2text-generation/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Text2text Generation inference */ @@ -18,7 +17,6 @@ export interface Text2TextGenerationInput { parameters?: Text2TextGenerationParameters; [property: string]: unknown; } - /** * Additional inference parameters for Text2text Generation */ @@ -30,16 +28,16 @@ export interface Text2TextGenerationParameters { /** * Additional parametrization of the text generation algorithm */ - generate_parameters?: { [key: string]: unknown }; + generate_parameters?: { + [key: string]: unknown; + }; /** * The truncation strategy to use */ truncation?: Text2TextGenerationTruncationStrategy; [property: string]: unknown; } - export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second"; - /** * Outputs of inference for the Text2text Generation task */ diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts index 1e517e4c7d..caa8750330 100644 --- a/packages/tasks/src/tasks/translation/inference.ts +++ b/packages/tasks/src/tasks/translation/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Translation inference */ @@ -18,7 +17,6 @@ export interface TranslationInput { parameters?: TranslationParameters; [property: string]: unknown; } - /** * Additional inference parameters for Translation */ @@ -30,7 +28,9 @@ export interface TranslationParameters { /** * Additional parametrization of the text generation algorithm. */ - generate_parameters?: { [key: string]: unknown }; + generate_parameters?: { + [key: string]: unknown; + }; /** * The source language of the text. Required for models that can translate from multiple * languages. @@ -47,12 +47,10 @@ export interface TranslationParameters { truncation?: TranslationTruncationStrategy; [property: string]: unknown; } - /** * The truncation strategy to use. */ export type TranslationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second"; - /** * Outputs of inference for the Translation task */ diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json index 0867bfcf24..f28c0453fd 100644 --- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json +++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json @@ -11,7 +11,8 @@ "title": "VisualQuestionAnsweringInputData", "properties": { "image": { - "description": "The image." + "description": "The image.", + "comment": "type=binary" }, "question": { "description": "The question to answer based on the image." 
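Every binary-input spec in this patch follows the same convention: the property keeps `"type": "string"` for JSON-Schema validation, while the extra `"comment": "type=binary"` marker tells the codegen post-processing pass to emit `Blob` instead. For any task marked this way, the regenerated interface takes roughly the shape sketched here (hypothetical task name; the real interfaces are the generated files in this patch):

// Sketch only: post-processed output of inference-codegen.ts for a spec
// whose `inputs` property carries `"comment": "type=binary"`.
export interface SomeBinaryTaskInput {
	/**
	 * The raw binary payload (was a base64-encoded string before this patch).
	 */
	inputs: Blob;
	/**
	 * Additional inference parameters
	 */
	parameters?: { [key: string]: unknown };
	[property: string]: unknown;
}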
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts index 594a4b5622..a700bb4f30 100644 --- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts +++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts @@ -10,7 +10,7 @@ export interface ZeroShotImageClassificationInput { /** * The input image data to classify as a base64-encoded string. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Zero Shot Image Classification */ diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json index 5795dd287d..7c3c413436 100644 --- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json +++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "type": "string", - "description": "The input image data to classify as a base64-encoded string." + "description": "The input image data to classify as a base64-encoded string.", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Zero Shot Image Classification", diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts index 860609547c..e5ae3eb1ac 100644 --- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts +++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts @@ -10,7 +10,7 @@ export interface ZeroShotObjectDetectionInput { /** * The input image data as a base64-encoded string. */ - inputs: string; + inputs: Blob; /** * Additional inference parameters for Zero Shot Object Detection */ diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json index d3532be05b..7e6264dee5 100644 --- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json +++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json @@ -7,7 +7,8 @@ "properties": { "inputs": { "description": "The input image data as a base64-encoded string.", - "type": "string" + "type": "string", + "comment": "type=binary" }, "parameters": { "description": "Additional inference parameters for Zero Shot Object Detection", From 580fe2b6492bc2c55d441fdecdd967559b1b76a6 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 13:37:10 +0100 Subject: [PATCH 02/23] wip --- .../src/tasks/audio/audioClassification.ts | 26 +++---------------- .../inference/src/tasks/audio/audioToAudio.ts | 1 + .../tasks/audio/automaticSpeechRecognition.ts | 26 +++++-------------- .../inference/src/tasks/audio/textToSpeech.ts | 10 ++----- .../tasks-gen/scripts/inference-codegen.ts | 4 +-- .../src/tasks/text-to-audio/inference.ts | 3 +-- .../src/tasks/text-to-audio/spec/output.json | 2 +- .../src/tasks/text-to-speech/inference.ts | 3 +-- 8 files changed, 19 insertions(+), 56 deletions(-) diff --git a/packages/inference/src/tasks/audio/audioClassification.ts b/packages/inference/src/tasks/audio/audioClassification.ts index 5d7e274e5a..5525be8b81 100644 --- a/packages/inference/src/tasks/audio/audioClassification.ts +++ b/packages/inference/src/tasks/audio/audioClassification.ts @@ -1,27 +1,9 @@ +import type { AudioClassificationInput, AudioClassificationOutput } from "@huggingface/tasks"; import { 
InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
-export type AudioClassificationArgs = BaseArgs & {
-	/**
-	 * Binary audio data
-	 */
-	data: Blob | ArrayBuffer;
-};
-
-export interface AudioClassificationOutputValue {
-	/**
-	 * The label for the class (model specific)
-	 */
-	label: string;
-
-	/**
-	 * A float that represents how likely it is that the audio file belongs to this class.
-	 */
-	score: number;
-}
-
-export type AudioClassificationReturn = AudioClassificationOutputValue[];
+export type AudioClassificationArgs = BaseArgs & AudioClassificationInput;
 
 /**
  * This task reads some audio input and outputs the likelihood of classes.
@@ -30,8 +12,8 @@ export type AudioClassificationReturn = AudioClassificationOutputValue[];
 export async function audioClassification(
 	args: AudioClassificationArgs,
 	options?: Options
-): Promise<AudioClassificationReturn> {
-	const res = await request<AudioClassificationReturn>(args, {
+): Promise<AudioClassificationOutput> {
+	const res = await request<AudioClassificationOutput>(args, {
 		...options,
 		taskHint: "audio-classification",
 	});
diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts
index c339cdf61a..016bd9d7f9 100644
--- a/packages/inference/src/tasks/audio/audioToAudio.ts
+++ b/packages/inference/src/tasks/audio/audioToAudio.ts
@@ -2,6 +2,7 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
+
 export type AudioToAudioArgs = BaseArgs & {
 	/**
 	 * Binary audio data
diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts
index c56090c087..36fab8de1d 100644
--- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts
+++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts
@@ -1,22 +1,10 @@
+import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
-import type { BaseArgs, Options, RequestArgs } from "../../types";
+import type { BaseArgs, Options } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
 
-export type AutomaticSpeechRecognitionArgs = BaseArgs & {
-	/**
-	 * Binary audio data
-	 */
-	data: Blob | ArrayBuffer;
-};
-
-export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The text that was recognized from the audio
-	 */
-	text: string;
-}
-
+export type AutomaticSpeechRecognitionArgs = BaseArgs & AutomaticSpeechRecognitionInput;
 /**
  * This task reads some audio input and outputs the said words within the audio files.
  * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -26,12 +14,12 @@ export async function automaticSpeechRecognition(
 	options?: Options
 ): Promise<AutomaticSpeechRecognitionOutput> {
 	if (args.provider === "fal-ai") {
-		const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
+		const contentType = args.inputs.type;
 		const base64audio = base64FromBytes(
-			new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
+			new Uint8Array(await args.inputs.arrayBuffer())
 		);
-		(args as RequestArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`;
-		delete (args as RequestArgs & { data: unknown }).data;
+		(args as AutomaticSpeechRecognitionArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`;
+		delete (args as Omit<AutomaticSpeechRecognitionArgs, "inputs"> & { inputs?: unknown }).inputs;
 	}
 	const res = await request<AutomaticSpeechRecognitionOutput>(args, {
diff --git a/packages/inference/src/tasks/audio/textToSpeech.ts b/packages/inference/src/tasks/audio/textToSpeech.ts
index 3c466110f5..64aadf6c70 100644
--- a/packages/inference/src/tasks/audio/textToSpeech.ts
+++ b/packages/inference/src/tasks/audio/textToSpeech.ts
@@ -1,15 +1,9 @@
+import type { TextToSpeechInput, TextToSpeechOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 
-export type TextToSpeechArgs = BaseArgs & {
-	/**
-	 * The text to generate an audio from
-	 */
-	inputs: string;
-};
-
-export type TextToSpeechOutput = Blob;
+type TextToSpeechArgs = BaseArgs & TextToSpeechInput;
 
 /**
  * This task synthesize an audio of a voice pronouncing a given text.
diff --git a/packages/tasks-gen/scripts/inference-codegen.ts b/packages/tasks-gen/scripts/inference-codegen.ts
index faf55f81c3..ae1ac72b11 100644
--- a/packages/tasks-gen/scripts/inference-codegen.ts
+++ b/packages/tasks-gen/scripts/inference-codegen.ts
@@ -103,8 +103,8 @@ async function postProcessOutput(
 	outputSpec: JSONSchemaSpec,
 	inputSpec: JSONSchemaSpec
 ): Promise<void> {
-	await generateTopLevelArrays(path2generated, outputSpec);
 	await generateBinaryInputTypes(path2generated, inputSpec, outputSpec);
+	await generateTopLevelArrays(path2generated, outputSpec);
 }
 
 async function generateBinaryInputTypes(
@@ -146,6 +146,7 @@ async function generateBinaryInputTypes(
 		}
 		const propName = propSignature.name.getText(tsSource);
 		const propIsMedia = !!spec["properties"]?.[propName]?.["comment"]?.includes("type=binary");
+		console.log(propName, propIsMedia);
 		if (!propIsMedia) {
 			return;
 		}
@@ -169,7 +170,6 @@ async function generateBinaryInputTypes(
 		});
 	}
 	const printer = ts.createPrinter();
-	console.log(printer.printList(ts.ListFormat.MultiLine, ts.factory.createNodeArray(newNodes), tsSource));
 
 	await fs.writeFile(
 		path2generated,
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index eed9c588b7..4a41019448 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -6,11 +6,10 @@ export interface TextToAudioOutput {
 	 * The generated audio waveform.
 	 */
 	audio: Blob;
-	samplingRate: unknown;
 	/**
 	 * The sampling rate of the generated audio waveform.
 	 */
-	sampling_rate?: number;
+	sampling_rate: number;
 	[property: string]: unknown;
 }
 /**
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
index 9db5ddbf82..8d39db79c1 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -14,5 +14,5 @@
       "description": "The sampling rate of the generated audio waveform."
} }, - "required": ["audio", "samplingRate"] + "required": ["audio", "sampling_rate"] } diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts index 26d2218b57..50ea306b38 100644 --- a/packages/tasks/src/tasks/text-to-speech/inference.ts +++ b/packages/tasks/src/tasks/text-to-speech/inference.ts @@ -127,10 +127,9 @@ export interface TextToSpeechOutput { * The generated audio waveform. */ audio: unknown; - samplingRate: unknown; /** * The sampling rate of the generated audio waveform. */ - sampling_rate?: number; + sampling_rate: number; [property: string]: unknown; } From 150fcb018712f22b7400f05b56a4c255652b9ba1 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 17:07:12 +0100 Subject: [PATCH 03/23] use types from tasks --- .../inference/src/tasks/audio/audioToAudio.ts | 1 - .../tasks/audio/automaticSpeechRecognition.ts | 16 ++++-- .../src/tasks/cv/imageClassification.ts | 21 +------- .../src/tasks/cv/imageSegmentation.ts | 25 +-------- .../inference/src/tasks/cv/imageToImage.ts | 52 +------------------ .../inference/src/tasks/cv/imageToText.ts | 16 +----- .../inference/src/tasks/cv/objectDetection.ts | 31 +---------- .../inference/src/tasks/cv/textToImage.ts | 42 +-------------- .../tasks/cv/zeroShotImageClassification.ts | 23 +------- .../multimodal/documentQuestionAnswering.ts | 32 +----------- .../multimodal/visualQuestionAnswering.ts | 24 +-------- packages/inference/src/tasks/nlp/fillMask.ts | 24 +-------- .../src/tasks/nlp/questionAnswering.ts | 43 +++++---------- .../src/tasks/nlp/sentenceSimilarity.ts | 16 +----- .../inference/src/tasks/nlp/summarization.ts | 44 +--------------- .../src/tasks/nlp/tableQuestionAnswering.ts | 46 ++++------------ .../src/tasks/nlp/textClassification.ts | 21 ++------ .../src/tasks/nlp/tokenClassification.ts | 49 +---------------- .../inference/src/tasks/nlp/translation.ts | 20 ++----- .../src/tasks/nlp/zeroShotClassification.ts | 26 +--------- 20 files changed, 68 insertions(+), 504 deletions(-) diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts index 016bd9d7f9..c339cdf61a 100644 --- a/packages/inference/src/tasks/audio/audioToAudio.ts +++ b/packages/inference/src/tasks/audio/audioToAudio.ts @@ -2,7 +2,6 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; - export type AudioToAudioArgs = BaseArgs & { /** * Binary audio data diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index 36fab8de1d..879cba18f6 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -15,10 +15,16 @@ export async function automaticSpeechRecognition( ): Promise { if (args.provider === "fal-ai") { const contentType = args.inputs.type; - const base64audio = base64FromBytes( - new Uint8Array(await args.inputs.arrayBuffer()) - ); - (args as AutomaticSpeechRecognitionArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`; + if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) { + throw new Error( + `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join( + ", " + )}` + ); + } + const base64audio = base64FromBytes(new Uint8Array(await 
args.inputs.arrayBuffer())); + (args as AutomaticSpeechRecognitionArgs & { audio_url: string }).audio_url = + `data:${contentType};base64,${base64audio}`; delete (args as Omit & { inputs?: unknown }).inputs; } const res = await request(args, { @@ -31,3 +37,5 @@ export async function automaticSpeechRecognition( } return res; } + +const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav"]; diff --git a/packages/inference/src/tasks/cv/imageClassification.ts b/packages/inference/src/tasks/cv/imageClassification.ts index 2ae7258704..41bfbdc20d 100644 --- a/packages/inference/src/tasks/cv/imageClassification.ts +++ b/packages/inference/src/tasks/cv/imageClassification.ts @@ -1,26 +1,9 @@ +import type { ImageClassificationInput, ImageClassificationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type ImageClassificationArgs = BaseArgs & { - /** - * Binary image data - */ - data: Blob | ArrayBuffer; -}; - -export interface ImageClassificationOutputValue { - /** - * The label for the class (model specific) - */ - label: string; - /** - * A float that represents how likely it is that the image file belongs to this class. - */ - score: number; -} - -export type ImageClassificationOutput = ImageClassificationOutputValue[]; +export type ImageClassificationArgs = BaseArgs & ImageClassificationInput; /** * This task reads some image input and outputs the likelihood of classes. diff --git a/packages/inference/src/tasks/cv/imageSegmentation.ts b/packages/inference/src/tasks/cv/imageSegmentation.ts index 171f065260..2b19d19e0d 100644 --- a/packages/inference/src/tasks/cv/imageSegmentation.ts +++ b/packages/inference/src/tasks/cv/imageSegmentation.ts @@ -1,30 +1,9 @@ +import type { ImageSegmentationInput, ImageSegmentationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type ImageSegmentationArgs = BaseArgs & { - /** - * Binary image data - */ - data: Blob | ArrayBuffer; -}; - -export interface ImageSegmentationOutputValue { - /** - * The label for the class (model specific) of a segment. - */ - label: string; - /** - * A str (base64 str of a single channel black-and-white img) representing the mask of a segment. - */ - mask: string; - /** - * A float that represents how likely it is that the detected object belongs to the given class. - */ - score: number; -} - -export type ImageSegmentationOutput = ImageSegmentationOutputValue[]; +export type ImageSegmentationArgs = BaseArgs & ImageSegmentationInput; /** * This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects. 
diff --git a/packages/inference/src/tasks/cv/imageToImage.ts b/packages/inference/src/tasks/cv/imageToImage.ts index 5c18ccb111..5a0c83c7a1 100644 --- a/packages/inference/src/tasks/cv/imageToImage.ts +++ b/packages/inference/src/tasks/cv/imageToImage.ts @@ -1,58 +1,10 @@ +import type { ImageToImageInput, ImageToImageOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options, RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import { request } from "../custom/request"; -export type ImageToImageArgs = BaseArgs & { - /** - * The initial image condition - * - **/ - inputs: Blob | ArrayBuffer; - - parameters?: { - /** - * The text prompt to guide the image generation. - */ - prompt?: string; - /** - * strengh param only works for SD img2img and alt diffusion img2img models - * Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - * will be used as a starting point, adding more noise to it the larger the `strength`. The number of - * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - * be maximum and the denoising process will run for the full number of iterations specified in - * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - **/ - strength?: number; - /** - * An optional negative prompt for the image generation - */ - negative_prompt?: string; - /** - * The height in pixels of the generated image - */ - height?: number; - /** - * The width in pixels of the generated image - */ - width?: number; - /** - * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - */ - num_inference_steps?: number; - /** - * Guidance scale: Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - */ - guidance_scale?: number; - /** - * guess_mode only works for ControlNet models, defaults to False In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - * you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. - */ - guess_mode?: boolean; - }; -}; - -export type ImageToImageOutput = Blob; +export type ImageToImageArgs = BaseArgs & ImageToImageInput; /** * This task reads some text input and outputs an image. diff --git a/packages/inference/src/tasks/cv/imageToText.ts b/packages/inference/src/tasks/cv/imageToText.ts index 9dd3ae8c20..584142e9fd 100644 --- a/packages/inference/src/tasks/cv/imageToText.ts +++ b/packages/inference/src/tasks/cv/imageToText.ts @@ -1,21 +1,9 @@ +import type { ImageToTextInput, ImageToTextOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type ImageToTextArgs = BaseArgs & { - /** - * Binary image data - */ - data: Blob | ArrayBuffer; -}; - -export interface ImageToTextOutput { - /** - * The generated caption - */ - generated_text: string; -} - +export type ImageToTextArgs = BaseArgs & ImageToTextInput; /** * This task reads some image input and outputs the text caption. 
*/ diff --git a/packages/inference/src/tasks/cv/objectDetection.ts b/packages/inference/src/tasks/cv/objectDetection.ts index 5bec721156..aed0102b1c 100644 --- a/packages/inference/src/tasks/cv/objectDetection.ts +++ b/packages/inference/src/tasks/cv/objectDetection.ts @@ -1,36 +1,9 @@ import { request } from "../custom/request"; import type { BaseArgs, Options } from "../../types"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; +import type { ObjectDetectionInput, ObjectDetectionOutput } from "@huggingface/tasks"; -export type ObjectDetectionArgs = BaseArgs & { - /** - * Binary image data - */ - data: Blob | ArrayBuffer; -}; - -export interface ObjectDetectionOutputValue { - /** - * A dict (with keys [xmin,ymin,xmax,ymax]) representing the bounding box of a detected object. - */ - box: { - xmax: number; - xmin: number; - ymax: number; - ymin: number; - }; - /** - * The label for the class (model specific) of a detected object. - */ - label: string; - - /** - * A float that represents how likely it is that the detected object belongs to the given class. - */ - score: number; -} - -export type ObjectDetectionOutput = ObjectDetectionOutputValue[]; +export type ObjectDetectionArgs = BaseArgs & ObjectDetectionInput; /** * This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects. diff --git a/packages/inference/src/tasks/cv/textToImage.ts b/packages/inference/src/tasks/cv/textToImage.ts index d8527d6539..f3de677da5 100644 --- a/packages/inference/src/tasks/cv/textToImage.ts +++ b/packages/inference/src/tasks/cv/textToImage.ts @@ -1,47 +1,9 @@ +import type { TextToImageInput, TextToImageOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type TextToImageArgs = BaseArgs & { - /** - * The text to generate an image from - */ - inputs: string; - - /** - * Same param but for external providers like Together, Replicate - */ - prompt?: string; - response_format?: "base64"; - input?: { - prompt: string; - }; - - parameters?: { - /** - * An optional negative prompt for the image generation - */ - negative_prompt?: string; - /** - * The height in pixels of the generated image - */ - height?: number; - /** - * The width in pixels of the generated image - */ - width?: number; - /** - * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - */ - num_inference_steps?: number; - /** - * Guidance scale: Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. 
- */ - guidance_scale?: number; - }; -}; - -export type TextToImageOutput = Blob; +export type TextToImageArgs = BaseArgs & TextToImageInput; interface Base64ImageGeneration { data: Array<{ diff --git a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts index 062b86b211..7a0c7ece66 100644 --- a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts +++ b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts @@ -3,28 +3,9 @@ import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; import type { RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; +import type { ZeroShotImageClassificationInput, ZeroShotImageClassificationOutput } from "@huggingface/tasks"; -export type ZeroShotImageClassificationArgs = BaseArgs & { - inputs: { - /** - * Binary image data - */ - image: Blob | ArrayBuffer; - }; - parameters: { - /** - * A list of strings that are potential classes for inputs. (max 10) - */ - candidate_labels: string[]; - }; -}; - -export interface ZeroShotImageClassificationOutputValue { - label: string; - score: number; -} - -export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputValue[]; +export type ZeroShotImageClassificationArgs = BaseArgs & ZeroShotImageClassificationInput; /** * Classify an image to specified classes. diff --git a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts index 205e956b9d..e272649383 100644 --- a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts @@ -4,37 +4,9 @@ import { request } from "../custom/request"; import type { RequestArgs } from "../../types"; import { toArray } from "../../utils/toArray"; import { base64FromBytes } from "../../utils/base64FromBytes"; +import type { DocumentQuestionAnsweringInput, DocumentQuestionAnsweringOutput } from "@huggingface/tasks"; -export type DocumentQuestionAnsweringArgs = BaseArgs & { - inputs: { - /** - * Raw image - * - * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()` - **/ - image: Blob | ArrayBuffer; - question: string; - }; -}; - -export interface DocumentQuestionAnsweringOutput { - /** - * A string that’s the answer within the document. - */ - answer: string; - /** - * ? - */ - end?: number; - /** - * A float that represents how likely that the answer is correct - */ - score?: number; - /** - * ? - */ - start?: number; -} +export type DocumentQuestionAnsweringArgs = BaseArgs & DocumentQuestionAnsweringInput; /** * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa. 
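For the multimodal tasks the input stays a nested object, but the `image` member is now binary data that the client base64-encodes before issuing the request. A hedged sketch of a documentQuestionAnswering call under the new types (the file path is illustrative; the question mirrors the test fixture used elsewhere in this series, and later patches narrow `image` to a Blob explicitly):

import fs from "node:fs/promises";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_TOKEN);
const res = await hf.documentQuestionAnswering({
  model: "impira/layoutlm-document-qa",
  inputs: {
    question: "Invoice number?",
    // Wrap the raw bytes in a typed Blob; "invoice.png" is an assumed local file
    image: new Blob([await fs.readFile("invoice.png")], { type: "image/png" }),
  },
});
// Depending on the revision in this series, res is a single { answer, score?, start?, end? }
// object or an array of such elements.
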
diff --git a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts index 80e8a9a15c..e36008e8cc 100644 --- a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts @@ -1,30 +1,10 @@ +import type { VisualQuestionAnsweringInput, VisualQuestionAnsweringOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options, RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import { request } from "../custom/request"; -export type VisualQuestionAnsweringArgs = BaseArgs & { - inputs: { - /** - * Raw image - * - * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()` - **/ - image: Blob | ArrayBuffer; - question: string; - }; -}; - -export interface VisualQuestionAnsweringOutput { - /** - * A string that’s the answer to a visual question. - */ - answer: string; - /** - * Answer correctness score. - */ - score: number; -} +export type VisualQuestionAnsweringArgs = BaseArgs & VisualQuestionAnsweringInput; /** * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa. diff --git a/packages/inference/src/tasks/nlp/fillMask.ts b/packages/inference/src/tasks/nlp/fillMask.ts index b8a2af1286..26f27e3557 100644 --- a/packages/inference/src/tasks/nlp/fillMask.ts +++ b/packages/inference/src/tasks/nlp/fillMask.ts @@ -1,29 +1,9 @@ +import type { FillMaskInput, FillMaskOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type FillMaskArgs = BaseArgs & { - inputs: string; -}; - -export type FillMaskOutput = { - /** - * The probability for this token. - */ - score: number; - /** - * The actual sequence of tokens that ran against the model (may contain special tokens) - */ - sequence: string; - /** - * The id of the token - */ - token: number; - /** - * The string representation of the token - */ - token_str: string; -}[]; +export type FillMaskArgs = BaseArgs & FillMaskInput; /** * Tries to fill in a hole with a missing word (token to be precise). That’s the base task for BERT models. diff --git a/packages/inference/src/tasks/nlp/questionAnswering.ts b/packages/inference/src/tasks/nlp/questionAnswering.ts index 58074eb9c9..14f88da659 100644 --- a/packages/inference/src/tasks/nlp/questionAnswering.ts +++ b/packages/inference/src/tasks/nlp/questionAnswering.ts @@ -1,32 +1,9 @@ +import type { QuestionAnsweringInput, QuestionAnsweringOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type QuestionAnsweringArgs = BaseArgs & { - inputs: { - context: string; - question: string; - }; -}; - -export interface QuestionAnsweringOutput { - /** - * A string that’s the answer within the text. - */ - answer: string; - /** - * The index (string wise) of the stop of the answer within context. - */ - end: number; - /** - * A float that represents how likely that the answer is correct - */ - score: number; - /** - * The index (string wise) of the start of the answer within context. 
- */ - start: number; -} +export type QuestionAnsweringArgs = BaseArgs & QuestionAnsweringInput; /** * Want to have a nice know-it-all bot that can answer any question?. Recommended model: deepset/roberta-base-squad2 @@ -40,12 +17,16 @@ export async function questionAnswering( taskHint: "question-answering", }); const isValidOutput = - typeof res === "object" && - !!res && - typeof res.answer === "string" && - typeof res.end === "number" && - typeof res.score === "number" && - typeof res.start === "number"; + Array.isArray(res) && + res.every( + (elem) => + typeof elem === "object" && + !!elem && + typeof elem.answer === "string" && + typeof elem.end === "number" && + typeof elem.score === "number" && + typeof elem.start === "number" + ); if (!isValidOutput) { throw new InferenceOutputError("Expected {answer: string, end: number, score: number, start: number}"); } diff --git a/packages/inference/src/tasks/nlp/sentenceSimilarity.ts b/packages/inference/src/tasks/nlp/sentenceSimilarity.ts index ec5c173ca2..7cd423fc45 100644 --- a/packages/inference/src/tasks/nlp/sentenceSimilarity.ts +++ b/packages/inference/src/tasks/nlp/sentenceSimilarity.ts @@ -1,22 +1,10 @@ +import type { SentenceSimilarityInput, SentenceSimilarityOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import { getDefaultTask } from "../../lib/getDefaultTask"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type SentenceSimilarityArgs = BaseArgs & { - /** - * The inputs vary based on the model. - * - * For example when using sentence-transformers/paraphrase-xlm-r-multilingual-v1 the inputs will have a `source_sentence` string and - * a `sentences` array of strings - */ - inputs: Record | Record[]; -}; - -/** - * Returned values are a list of floats - */ -export type SentenceSimilarityOutput = number[]; +export type SentenceSimilarityArgs = BaseArgs & SentenceSimilarityInput; /** * Calculate the semantic similarity between one text and a list of other sentences by comparing their embeddings. diff --git a/packages/inference/src/tasks/nlp/summarization.ts b/packages/inference/src/tasks/nlp/summarization.ts index 71efd1c3b9..6a855960dc 100644 --- a/packages/inference/src/tasks/nlp/summarization.ts +++ b/packages/inference/src/tasks/nlp/summarization.ts @@ -1,50 +1,10 @@ +import type { SummarizationInput, SummarizationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type SummarizationArgs = BaseArgs & { - /** - * A string to be summarized - */ - inputs: string; - parameters?: { - /** - * (Default: None). Integer to define the maximum length in tokens of the output summary. - */ - max_length?: number; - /** - * (Default: None). Float (0-120.0). The amount of time in seconds that the query should take maximum. Network can cause some overhead so it will be a soft limit. - */ - max_time?: number; - /** - * (Default: None). Integer to define the minimum length in tokens of the output summary. - */ - min_length?: number; - /** - * (Default: None). Float (0.0-100.0). The more a token is used within generation the more it is penalized to not be picked in successive generation passes. - */ - repetition_penalty?: number; - /** - * (Default: 1.0). Float (0.0-100.0). The temperature of the sampling operation. 
1 means regular sampling, 0 means always take the highest score, 100.0 is getting closer to uniform probability. - */ - temperature?: number; - /** - * (Default: None). Integer to define the top tokens considered within the sample operation to create new text. - */ - top_k?: number; - /** - * (Default: None). Float to define the tokens that are within the sample operation of text generation. Add tokens in the sample for more probable to least probable until the sum of the probabilities is greater than top_p. - */ - top_p?: number; - }; -}; +export type SummarizationArgs = BaseArgs & SummarizationInput; -export interface SummarizationOutput { - /** - * The string after translation - */ - summary_text: string; -} /** * This task is well known to summarize longer text into shorter text. Be careful, some models have a maximum length of input. That means that the summary cannot handle full books for instance. Be careful when choosing your model. diff --git a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts index a0cf692512..c9660be8cb 100644 --- a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts +++ b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts @@ -1,38 +1,10 @@ +import type { TableQuestionAnsweringInput, TableQuestionAnsweringOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type TableQuestionAnsweringArgs = BaseArgs & { - inputs: { - /** - * The query in plain text that you want to ask the table - */ - query: string; - /** - * A table of data represented as a dict of list where entries are headers and the lists are all the values, all lists must have the same size. - */ - table: Record; - }; -}; +export type TableQuestionAnsweringArgs = BaseArgs & TableQuestionAnsweringInput; -export interface TableQuestionAnsweringOutput { - /** - * The aggregator used to get the answer - */ - aggregator: string; - /** - * The plaintext answer - */ - answer: string; - /** - * A list of coordinates of the cells contents - */ - cells: string[]; - /** - * a list of coordinates of the cells referenced in the answer - */ - coordinates: number[][]; -} /** * Don’t know SQL? Don’t want to dive into a large spreadsheet? Ask questions in plain english! Recommended model: google/tapas-base-finetuned-wtq. 
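As a usage reference for the generated TableQuestionAnsweringInput: the query/table shape matches the hand-written type removed above (a dict of equally sized columns). A sketch, with values illustrative and modeled on the package's test fixtures:

import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_TOKEN);
const res = await hf.tableQuestionAnswering({
  model: "google/tapas-base-finetuned-wtq",
  inputs: {
    query: "How many stars does the transformers repository have?",
    table: {
      Repository: ["Transformers", "Datasets", "Tokenizers"],
      Stars: ["36542", "4512", "3934"],
    },
  },
});
// Each result element exposes { aggregator, answer, cells, coordinates };
// the hunk below switches validation (and the return shape) to an array of these.
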
@@ -46,12 +18,14 @@ export async function tableQuestionAnswering( taskHint: "table-question-answering", }); const isValidOutput = - typeof res?.aggregator === "string" && - typeof res.answer === "string" && - Array.isArray(res.cells) && - res.cells.every((x) => typeof x === "string") && - Array.isArray(res.coordinates) && - res.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")); + Array.isArray(res) && res.every(elem => { + typeof elem?.aggregator === "string" && + typeof elem.answer === "string" && + Array.isArray(elem.cells) && + elem.cells.every((x) => typeof x === "string") && + Array.isArray(elem.coordinates) && + elem.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")) + }); if (!isValidOutput) { throw new InferenceOutputError( "Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}" diff --git a/packages/inference/src/tasks/nlp/textClassification.ts b/packages/inference/src/tasks/nlp/textClassification.ts index 41ced40571..7c08cd816a 100644 --- a/packages/inference/src/tasks/nlp/textClassification.ts +++ b/packages/inference/src/tasks/nlp/textClassification.ts @@ -1,24 +1,9 @@ +import type { TextClassificationInput, TextClassificationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type TextClassificationArgs = BaseArgs & { - /** - * A string to be classified - */ - inputs: string; -}; - -export type TextClassificationOutput = { - /** - * The label for the class (model specific) - */ - label: string; - /** - * A floats that represents how likely is that the text belongs to this class. - */ - score: number; -}[]; +export type TextClassificationArgs = BaseArgs & TextClassificationInput; /** * Usually used for sentiment-analysis this will output the likelihood of classes of an input. Recommended model: distilbert-base-uncased-finetuned-sst-2-english @@ -28,7 +13,7 @@ export async function textClassification( options?: Options ): Promise { const res = ( - await request(args, { + await request(args, { ...options, taskHint: "text-classification", }) diff --git a/packages/inference/src/tasks/nlp/tokenClassification.ts b/packages/inference/src/tasks/nlp/tokenClassification.ts index eeee58d4c6..13d2a51451 100644 --- a/packages/inference/src/tasks/nlp/tokenClassification.ts +++ b/packages/inference/src/tasks/nlp/tokenClassification.ts @@ -1,55 +1,10 @@ +import type { TokenClassificationInput, TokenClassificationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { toArray } from "../../utils/toArray"; import { request } from "../custom/request"; -export type TokenClassificationArgs = BaseArgs & { - /** - * A string to be classified - */ - inputs: string; - parameters?: { - /** - * (Default: simple). There are several aggregation strategies: - * - * none: Every token gets classified without further aggregation. - * - * simple: Entities are grouped according to the default schema (B-, I- tags get merged when the tag is similar). - * - * first: Same as the simple strategy except words cannot end up with different tags. Words will use the tag of the first token when there is ambiguity. - * - * average: Same as the simple strategy except words cannot end up with different tags. 
Scores are averaged across tokens and then the maximum label is applied. - * - * max: Same as the simple strategy except words cannot end up with different tags. Word entity will be the token with the maximum score. - */ - aggregation_strategy?: "none" | "simple" | "first" | "average" | "max"; - }; -}; - -export interface TokenClassificationOutputValue { - /** - * The offset stringwise where the answer is located. Useful to disambiguate if word occurs multiple times. - */ - end: number; - /** - * The type for the entity being recognized (model specific). - */ - entity_group: string; - /** - * How likely the entity was recognized. - */ - score: number; - /** - * The offset stringwise where the answer is located. Useful to disambiguate if word occurs multiple times. - */ - start: number; - /** - * The string that was captured - */ - word: string; -} - -export type TokenClassificationOutput = TokenClassificationOutputValue[]; +export type TokenClassificationArgs = BaseArgs & TokenClassificationInput; /** * Usually used for sentence parsing, either grammatical, or Named Entity Recognition (NER) to understand keywords contained within text. Recommended model: dbmdz/bert-large-cased-finetuned-conll03-english diff --git a/packages/inference/src/tasks/nlp/translation.ts b/packages/inference/src/tasks/nlp/translation.ts index ea7a3054c0..e92f302260 100644 --- a/packages/inference/src/tasks/nlp/translation.ts +++ b/packages/inference/src/tasks/nlp/translation.ts @@ -1,28 +1,14 @@ +import type { TranslationInput, TranslationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -export type TranslationArgs = BaseArgs & { - /** - * A string to be translated - */ - inputs: string | string[]; -}; - -export interface TranslationOutputValue { - /** - * The string after translation - */ - translation_text: string; -} - -export type TranslationOutput = TranslationOutputValue | TranslationOutputValue[]; - +export type TranslationArgs = BaseArgs & TranslationInput; /** * This task is well known to translate text from one language to another. Recommended model: Helsinki-NLP/opus-mt-ru-en. */ export async function translation(args: TranslationArgs, options?: Options): Promise { - const res = await request(args, { + const res = await request(args, { ...options, taskHint: "translation", }); diff --git a/packages/inference/src/tasks/nlp/zeroShotClassification.ts b/packages/inference/src/tasks/nlp/zeroShotClassification.ts index 2552489c36..933aaa53c5 100644 --- a/packages/inference/src/tasks/nlp/zeroShotClassification.ts +++ b/packages/inference/src/tasks/nlp/zeroShotClassification.ts @@ -1,32 +1,10 @@ +import type { ZeroShotClassificationInput, ZeroShotClassificationOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { toArray } from "../../utils/toArray"; import { request } from "../custom/request"; -export type ZeroShotClassificationArgs = BaseArgs & { - /** - * a string or list of strings - */ - inputs: string | string[]; - parameters: { - /** - * a list of strings that are potential classes for inputs. (max 10 candidate_labels, for more, simply run multiple requests, results are going to be misleading if using too many candidate_labels anyway. If you want to keep the exact same, you can simply run multi_label=True and do the scaling on your end. 
- */ - candidate_labels: string[]; - /** - * (Default: false) Boolean that is set to True if classes can overlap - */ - multi_label?: boolean; - }; -}; - -export interface ZeroShotClassificationOutputValue { - labels: string[]; - scores: number[]; - sequence: string; -} - -export type ZeroShotClassificationOutput = ZeroShotClassificationOutputValue[]; +export type ZeroShotClassificationArgs = BaseArgs & ZeroShotClassificationInput; /** * This task is super useful to try out classification with zero code, you simply pass a sentence/paragraph and the possible labels for that sentence, and you get a result. Recommended model: facebook/bart-large-mnli. From 5e63475a06e08bff88827dabd3686aa63f6c11b3 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 17:11:00 +0100 Subject: [PATCH 04/23] Update types --- packages/inference/src/tasks/cv/textToImage.ts | 8 ++++---- .../inference/src/tasks/cv/zeroShotImageClassification.ts | 6 +----- .../src/tasks/multimodal/documentQuestionAnswering.ts | 4 +--- packages/inference/src/tasks/nlp/summarization.ts | 1 - .../inference/src/tasks/nlp/tableQuestionAnswering.ts | 6 +++--- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/packages/inference/src/tasks/cv/textToImage.ts b/packages/inference/src/tasks/cv/textToImage.ts index f3de677da5..9d9a654687 100644 --- a/packages/inference/src/tasks/cv/textToImage.ts +++ b/packages/inference/src/tasks/cv/textToImage.ts @@ -34,23 +34,23 @@ export async function textToImage(args: TextToImageArgs, options?: Options): Pro if (res && typeof res === "object") { if (args.provider === "fal-ai" && "images" in res && Array.isArray(res.images) && res.images[0].url) { const image = await fetch(res.images[0].url); - return await image.blob(); + return { image: await image.blob() }; } if ("data" in res && Array.isArray(res.data) && res.data[0].b64_json) { const base64Data = res.data[0].b64_json; const base64Response = await fetch(`data:image/jpeg;base64,${base64Data}`); const blob = await base64Response.blob(); - return blob; + return { image: blob }; } if ("output" in res && Array.isArray(res.output)) { const urlResponse = await fetch(res.output[0]); const blob = await urlResponse.blob(); - return blob; + return { image: blob }; } } const isValidOutput = res && res instanceof Blob; if (!isValidOutput) { throw new InferenceOutputError("Expected Blob"); } - return res; + return { image: res }; } diff --git a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts index 7a0c7ece66..014cbe8126 100644 --- a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts +++ b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts @@ -18,11 +18,7 @@ export async function zeroShotImageClassification( const reqArgs: RequestArgs = { ...args, inputs: { - image: base64FromBytes( - new Uint8Array( - args.inputs.image instanceof ArrayBuffer ? 
args.inputs.image : await args.inputs.image.arrayBuffer() - ) - ), + image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())), }, } as RequestArgs; diff --git a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts index e272649383..28689c334d 100644 --- a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts @@ -21,9 +21,7 @@ export async function documentQuestionAnswering( question: args.inputs.question, // convert Blob or ArrayBuffer to base64 image: base64FromBytes( - new Uint8Array( - args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer() - ) + new Uint8Array(await args.inputs.arrayBuffer()) ), }, } as RequestArgs; diff --git a/packages/inference/src/tasks/nlp/summarization.ts b/packages/inference/src/tasks/nlp/summarization.ts index 6a855960dc..e9e093a788 100644 --- a/packages/inference/src/tasks/nlp/summarization.ts +++ b/packages/inference/src/tasks/nlp/summarization.ts @@ -5,7 +5,6 @@ import { request } from "../custom/request"; export type SummarizationArgs = BaseArgs & SummarizationInput; - /** * This task is well known to summarize longer text into shorter text. Be careful, some models have a maximum length of input. That means that the summary cannot handle full books for instance. Be careful when choosing your model. */ diff --git a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts index c9660be8cb..217985d663 100644 --- a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts +++ b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts @@ -5,7 +5,6 @@ import { request } from "../custom/request"; export type TableQuestionAnsweringArgs = BaseArgs & TableQuestionAnsweringInput; - /** * Don’t know SQL? Don’t want to dive into a large spreadsheet? Ask questions in plain english! Recommended model: google/tapas-base-finetuned-wtq. 
*/ @@ -18,13 +17,14 @@ export async function tableQuestionAnswering( taskHint: "table-question-answering", }); const isValidOutput = - Array.isArray(res) && res.every(elem => { + Array.isArray(res) && + res.every((elem) => { typeof elem?.aggregator === "string" && typeof elem.answer === "string" && Array.isArray(elem.cells) && elem.cells.every((x) => typeof x === "string") && Array.isArray(elem.coordinates) && - elem.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")) + elem.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")); }); if (!isValidOutput) { throw new InferenceOutputError( From ef5693c53535b5907a592eebddb0d3380d14091c Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 17:20:39 +0100 Subject: [PATCH 05/23] update test --- .../tasks/audio/automaticSpeechRecognition.ts | 2 +- .../inference/src/tasks/audio/textToSpeech.ts | 9 +++++--- .../inference/src/tasks/cv/imageToImage.ts | 2 +- .../inference/src/tasks/cv/textToImage.ts | 1 + .../multimodal/documentQuestionAnswering.ts | 4 +--- .../multimodal/visualQuestionAnswering.ts | 4 +--- packages/inference/test/HfInference.spec.ts | 22 +++++++++---------- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index 879cba18f6..9becd0937b 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -38,4 +38,4 @@ export async function automaticSpeechRecognition( return res; } -const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav"]; +const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"]; diff --git a/packages/inference/src/tasks/audio/textToSpeech.ts b/packages/inference/src/tasks/audio/textToSpeech.ts index 64aadf6c70..7a6073dad0 100644 --- a/packages/inference/src/tasks/audio/textToSpeech.ts +++ b/packages/inference/src/tasks/audio/textToSpeech.ts @@ -1,16 +1,19 @@ -import type { TextToSpeechInput, TextToSpeechOutput } from "@huggingface/tasks"; +import type { TextToSpeechInput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; type TextToSpeechArgs = BaseArgs & TextToSpeechInput; +interface TextToSpeechOutput { + audio: Blob; +} /** * This task synthesize an audio of a voice pronouncing a given text. 
* Recommended model: espnet/kan-bayashi_ljspeech_vits */ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise { - const res = await request(args, { + const res = await request(args, { ...options, taskHint: "text-to-speech", }); @@ -18,5 +21,5 @@ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): P if (!isValidOutput) { throw new InferenceOutputError("Expected Blob"); } - return res; + return { audio: res }; } diff --git a/packages/inference/src/tasks/cv/imageToImage.ts b/packages/inference/src/tasks/cv/imageToImage.ts index 5a0c83c7a1..b9bd49c195 100644 --- a/packages/inference/src/tasks/cv/imageToImage.ts +++ b/packages/inference/src/tasks/cv/imageToImage.ts @@ -34,5 +34,5 @@ export async function imageToImage(args: ImageToImageArgs, options?: Options): P if (!isValidOutput) { throw new InferenceOutputError("Expected Blob"); } - return res; + return { image: res }; } diff --git a/packages/inference/src/tasks/cv/textToImage.ts b/packages/inference/src/tasks/cv/textToImage.ts index 9d9a654687..3c9f3db2a1 100644 --- a/packages/inference/src/tasks/cv/textToImage.ts +++ b/packages/inference/src/tasks/cv/textToImage.ts @@ -31,6 +31,7 @@ export async function textToImage(args: TextToImageArgs, options?: Options): Pro ...options, taskHint: "text-to-image", }); + console.log(res); if (res && typeof res === "object") { if (args.provider === "fal-ai" && "images" in res && Array.isArray(res.images) && res.images[0].url) { const image = await fetch(res.images[0].url); diff --git a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts index 28689c334d..dee65bca47 100644 --- a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts @@ -20,9 +20,7 @@ export async function documentQuestionAnswering( inputs: { question: args.inputs.question, // convert Blob or ArrayBuffer to base64 - image: base64FromBytes( - new Uint8Array(await args.inputs.arrayBuffer()) - ), + image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())), }, } as RequestArgs; const res = toArray( diff --git a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts index e36008e8cc..12c682bfb2 100644 --- a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts @@ -19,9 +19,7 @@ export async function visualQuestionAnswering( question: args.inputs.question, // convert Blob or ArrayBuffer to base64 image: base64FromBytes( - new Uint8Array( - args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer() - ) + new Uint8Array(args.inputs.image instanceof ArrayBuffer ? 
args.inputs.image : await args.inputs.arrayBuffer()) ), }, } as RequestArgs; diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index 4a989462f8..dd488de7a6 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -451,7 +451,7 @@ describe.concurrent("HfInference", () => { model: "espnet/kan-bayashi_ljspeech_vits", inputs: "hello there!", }) - ).toBeInstanceOf(Blob); + ).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("imageClassification", async () => { @@ -543,14 +543,14 @@ describe.concurrent("HfInference", () => { }, model: "lllyasviel/sd-controlnet-depth", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("imageToImage blob data", async () => { const res = await hf.imageToImage({ inputs: new Blob([readTestFile("bird_canny.png")], { type: "image / png" }), model: "lllyasviel/sd-controlnet-canny", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("textToImage", async () => { const res = await hf.textToImage({ @@ -558,7 +558,7 @@ describe.concurrent("HfInference", () => { "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]", model: "stabilityai/stable-diffusion-2", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("textToImage with parameters", async () => { @@ -577,7 +577,7 @@ describe.concurrent("HfInference", () => { num_inference_steps, }, }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("imageToText", async () => { expect( @@ -781,14 +781,14 @@ describe.concurrent("HfInference", () => { provider: "fal-ai", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("speechToText", async () => { const res = await client.automaticSpeechRecognition({ model: "openai/whisper-large-v3", provider: "fal-ai", - data: new Blob([readTestFile("sample2.wav")], { type: "audio/x-wav" }), + inputs: new Blob([readTestFile("sample2.wav")], { type: "audio/x-wav" }), }); expect(res).toMatchObject({ text: " he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca", @@ -809,7 +809,7 @@ describe.concurrent("HfInference", () => { provider: "replicate", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("textToImage versioned", async () => { @@ -818,7 +818,7 @@ describe.concurrent("HfInference", () => { provider: "replicate", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in 
out && out.image instanceof Blob); }); it.skip("textToSpeech versioned", async () => { @@ -827,7 +827,7 @@ describe.concurrent("HfInference", () => { provider: "replicate", inputs: "Hello, how are you?", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); }, TIMEOUT @@ -904,7 +904,7 @@ describe.concurrent("HfInference", () => { provider: "together", inputs: "award winning high resolution photo of a giant tortoise", }); - expect(res).toBeInstanceOf(Blob); + expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("textGeneration", async () => { From b193fb14c133e8e857535fce814df6ed4351a978 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 18:03:27 +0100 Subject: [PATCH 06/23] question answering test + inference code --- .../src/tasks/nlp/questionAnswering.ts | 31 +++++++++++-------- packages/inference/test/HfInference.spec.ts | 20 ++++++------ .../visual-question-answering/spec/input.json | 3 +- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/packages/inference/src/tasks/nlp/questionAnswering.ts b/packages/inference/src/tasks/nlp/questionAnswering.ts index 14f88da659..7afbfd9b2d 100644 --- a/packages/inference/src/tasks/nlp/questionAnswering.ts +++ b/packages/inference/src/tasks/nlp/questionAnswering.ts @@ -12,23 +12,28 @@ export async function questionAnswering( args: QuestionAnsweringArgs, options?: Options ): Promise { - const res = await request(args, { + const res = await request(args, { ...options, taskHint: "question-answering", }); const isValidOutput = - Array.isArray(res) && - res.every( - (elem) => - typeof elem === "object" && - !!elem && - typeof elem.answer === "string" && - typeof elem.end === "number" && - typeof elem.score === "number" && - typeof elem.start === "number" - ); + Array.isArray(res) ? + res.every( + (elem) => + typeof elem === "object" && + !!elem && + typeof elem.answer === "string" && + typeof elem.end === "number" && + typeof elem.score === "number" && + typeof elem.start === "number" + ) : (typeof res === "object" && + !!res && + typeof res.answer === "string" && + typeof res.end === "number" && + typeof res.score === "number" && + typeof res.start === "number"); if (!isValidOutput) { - throw new InferenceOutputError("Expected {answer: string, end: number, score: number, start: number}"); + throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>"); } - return res; + return Array.isArray(res) ? 
res : [res]; } diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index dd488de7a6..3f68077fb2 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -80,20 +80,20 @@ describe.concurrent("HfInference", () => { }); it("questionAnswering", async () => { - expect( - await hf.questionAnswering({ - model: "deepset/roberta-base-squad2", - inputs: { - question: "What is the capital of France?", - context: "The capital of France is Paris.", - }, - }) - ).toMatchObject({ + const res = await hf.questionAnswering({ + model: "deepset/roberta-base-squad2", + inputs: { + question: "What is the capital of France?", + context: "The capital of France is Paris.", + }, + }); + + expect(res).toMatchObject([{ answer: "Paris", score: expect.any(Number), start: expect.any(Number), end: expect.any(Number), - }); + }]); }); it("tableQuestionAnswering", async () => { diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json index f28c0453fd..d51f35c379 100644 --- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json +++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json @@ -15,7 +15,8 @@ "comment": "type=binary" }, "question": { - "description": "The question to answer based on the image." + "description": "The question to answer based on the image.", + "type": "string" } }, "required": ["question", "image"] From 699140edf11d4c56e228ef89b3d8a3cb6c406e09 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 18:32:23 +0100 Subject: [PATCH 07/23] wip --- .../inference/src/tasks/audio/textToSpeech.ts | 5 +- .../inference/src/tasks/cv/textToImage.ts | 1 - .../multimodal/documentQuestionAnswering.ts | 32 +++++-- .../multimodal/visualQuestionAnswering.ts | 30 ++++--- .../src/tasks/nlp/questionAnswering.ts | 18 ++-- .../src/tasks/nlp/tableQuestionAnswering.ts | 34 ++++--- packages/inference/test/HfInference.spec.ts | 90 ++++++++++--------- .../src/tasks/text-to-speech/inference.ts | 30 +++---- .../src/tasks/text-to-speech/spec/output.json | 15 +++- .../visual-question-answering/inference.ts | 2 +- 10 files changed, 149 insertions(+), 108 deletions(-) diff --git a/packages/inference/src/tasks/audio/textToSpeech.ts b/packages/inference/src/tasks/audio/textToSpeech.ts index 7a6073dad0..f60ee367ed 100644 --- a/packages/inference/src/tasks/audio/textToSpeech.ts +++ b/packages/inference/src/tasks/audio/textToSpeech.ts @@ -1,13 +1,10 @@ -import type { TextToSpeechInput } from "@huggingface/tasks"; +import type { TextToSpeechInput, TextToSpeechOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; type TextToSpeechArgs = BaseArgs & TextToSpeechInput; -interface TextToSpeechOutput { - audio: Blob; -} /** * This task synthesize an audio of a voice pronouncing a given text. 
* Recommended model: espnet/kan-bayashi_ljspeech_vits diff --git a/packages/inference/src/tasks/cv/textToImage.ts b/packages/inference/src/tasks/cv/textToImage.ts index 3c9f3db2a1..9d9a654687 100644 --- a/packages/inference/src/tasks/cv/textToImage.ts +++ b/packages/inference/src/tasks/cv/textToImage.ts @@ -31,7 +31,6 @@ export async function textToImage(args: TextToImageArgs, options?: Options): Pro ...options, taskHint: "text-to-image", }); - console.log(res); if (res && typeof res === "object") { if (args.provider === "fal-ai" && "images" in res && Array.isArray(res.images) && res.images[0].url) { const image = await fetch(res.images[0].url); diff --git a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts index dee65bca47..60d4aba008 100644 --- a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts @@ -4,9 +4,15 @@ import { request } from "../custom/request"; import type { RequestArgs } from "../../types"; import { toArray } from "../../utils/toArray"; import { base64FromBytes } from "../../utils/base64FromBytes"; -import type { DocumentQuestionAnsweringInput, DocumentQuestionAnsweringOutput } from "@huggingface/tasks"; +import type { + DocumentQuestionAnsweringInput, + DocumentQuestionAnsweringInputData, + DocumentQuestionAnsweringOutput, +} from "@huggingface/tasks"; -export type DocumentQuestionAnsweringArgs = BaseArgs & DocumentQuestionAnsweringInput; +/// Override the type to properly set inputs.image as Blob +export type DocumentQuestionAnsweringArgs = BaseArgs & + DocumentQuestionAnsweringInput & { inputs: DocumentQuestionAnsweringInputData & { image: Blob } }; /** * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa. 
@@ -20,22 +26,30 @@ export async function documentQuestionAnswering( inputs: { question: args.inputs.question, // convert Blob or ArrayBuffer to base64 - image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())), + image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())), }, } as RequestArgs; const res = toArray( - await request<[DocumentQuestionAnsweringOutput] | DocumentQuestionAnsweringOutput>(reqArgs, { + await request(reqArgs, { ...options, taskHint: "document-question-answering", }) - )?.[0]; + ); + const isValidOutput = - typeof res?.answer === "string" && - (typeof res.end === "number" || typeof res.end === "undefined") && - (typeof res.score === "number" || typeof res.score === "undefined") && - (typeof res.start === "number" || typeof res.start === "undefined"); + Array.isArray(res) && + res.every( + (elem) => + typeof elem === "object" && + !!elem && + typeof elem?.answer === "string" && + (typeof elem.end === "number" || typeof elem.end === "undefined") && + (typeof elem.score === "number" || typeof elem.score === "undefined") && + (typeof elem.start === "number" || typeof elem.start === "undefined") + ); if (!isValidOutput) { throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>"); } + return res; } diff --git a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts index 12c682bfb2..5e5767d161 100644 --- a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts @@ -1,10 +1,16 @@ -import type { VisualQuestionAnsweringInput, VisualQuestionAnsweringOutput } from "@huggingface/tasks"; +import type { + VisualQuestionAnsweringInput, + VisualQuestionAnsweringInputData, + VisualQuestionAnsweringOutput, +} from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options, RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import { request } from "../custom/request"; -export type VisualQuestionAnsweringArgs = BaseArgs & VisualQuestionAnsweringInput; +/// Override the type to properly set inputs.image as Blob +export type VisualQuestionAnsweringArgs = BaseArgs & + VisualQuestionAnsweringInput & { inputs: VisualQuestionAnsweringInputData & { image: Blob } }; /** * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa. @@ -18,18 +24,18 @@ export async function visualQuestionAnswering( inputs: { question: args.inputs.question, // convert Blob or ArrayBuffer to base64 - image: base64FromBytes( - new Uint8Array(args.inputs.image instanceof ArrayBuffer ? 
args.inputs.image : await args.inputs.arrayBuffer()) - ), + image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())), }, } as RequestArgs; - const res = ( - await request<[VisualQuestionAnsweringOutput]>(reqArgs, { - ...options, - taskHint: "visual-question-answering", - }) - )?.[0]; - const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number"; + const res = await request(reqArgs, { + ...options, + taskHint: "visual-question-answering", + }); + const isValidOutput = + Array.isArray(res) && + res.every( + (elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && typeof elem.score === "number" + ); if (!isValidOutput) { throw new InferenceOutputError("Expected Array<{answer: string, score: number}>"); } diff --git a/packages/inference/src/tasks/nlp/questionAnswering.ts b/packages/inference/src/tasks/nlp/questionAnswering.ts index 7afbfd9b2d..4fc63fa505 100644 --- a/packages/inference/src/tasks/nlp/questionAnswering.ts +++ b/packages/inference/src/tasks/nlp/questionAnswering.ts @@ -16,9 +16,8 @@ export async function questionAnswering( ...options, taskHint: "question-answering", }); - const isValidOutput = - Array.isArray(res) ? - res.every( + const isValidOutput = Array.isArray(res) + ? res.every( (elem) => typeof elem === "object" && !!elem && @@ -26,12 +25,13 @@ export async function questionAnswering( typeof elem.end === "number" && typeof elem.score === "number" && typeof elem.start === "number" - ) : (typeof res === "object" && - !!res && - typeof res.answer === "string" && - typeof res.end === "number" && - typeof res.score === "number" && - typeof res.start === "number"); + ) + : typeof res === "object" && + !!res && + typeof res.answer === "string" && + typeof res.end === "number" && + typeof res.score === "number" && + typeof res.start === "number"; if (!isValidOutput) { throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>"); } diff --git a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts index 217985d663..2d51d7e067 100644 --- a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts +++ b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts @@ -12,24 +12,34 @@ export async function tableQuestionAnswering( args: TableQuestionAnsweringArgs, options?: Options ): Promise { - const res = await request(args, { + const res = await request(args, { ...options, taskHint: "table-question-answering", }); - const isValidOutput = - Array.isArray(res) && - res.every((elem) => { - typeof elem?.aggregator === "string" && - typeof elem.answer === "string" && - Array.isArray(elem.cells) && - elem.cells.every((x) => typeof x === "string") && - Array.isArray(elem.coordinates) && - elem.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")); - }); + const isValidOutput = Array.isArray(res) ? res.every((elem) => validate(elem)) : validate(res); if (!isValidOutput) { throw new InferenceOutputError( "Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}" ); } - return res; + return Array.isArray(res) ? 
res : [res]; +} + +function validate(elem: unknown): elem is TableQuestionAnsweringOutput[number] { + return ( + typeof elem === "object" && + !!elem && + "aggregator" in elem && + typeof elem.aggregator === "string" && + "answer" in elem && + typeof elem.answer === "string" && + "cells" in elem && + Array.isArray(elem.cells) && + elem.cells.every((x: unknown): x is string => typeof x === "string") && + "coordinates" in elem && + Array.isArray(elem.coordinates) && + elem.coordinates.every( + (coord: unknown): coord is number[] => Array.isArray(coord) && coord.every((x) => typeof x === "number") + ) + ); } diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index 3f68077fb2..dfda4609fa 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -1,6 +1,6 @@ import { expect, it, describe, assert } from "vitest"; -import type { ChatCompletionStreamOutput } from "@huggingface/tasks"; +import type { ChatCompletionStreamOutput, VisualQuestionAnsweringInput } from "@huggingface/tasks"; import { chatCompletion, HfInference } from "../src"; import "./vcr"; @@ -87,13 +87,14 @@ describe.concurrent("HfInference", () => { context: "The capital of France is Paris.", }, }); - - expect(res).toMatchObject([{ - answer: "Paris", - score: expect.any(Number), - start: expect.any(Number), - end: expect.any(Number), - }]); + expect(res).toMatchObject([ + { + answer: "Paris", + score: expect.any(Number), + start: expect.any(Number), + end: expect.any(Number), + }, + ]); }); it("tableQuestionAnswering", async () => { @@ -110,30 +111,31 @@ describe.concurrent("HfInference", () => { }, }, }) - ).toMatchObject({ - answer: "AVERAGE > 36542", - coordinates: [[0, 1]], - cells: ["36542"], - aggregator: "AVERAGE", - }); + ).toMatchObject([ + { + answer: "AVERAGE > 36542", + coordinates: [[0, 1]], + cells: ["36542"], + aggregator: "AVERAGE", + }, + ]); }); it("documentQuestionAnswering", async () => { - expect( - await hf.documentQuestionAnswering({ - model: "impira/layoutlm-document-qa", - inputs: { - question: "Invoice number?", - image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), - }, - }) - ).toMatchObject({ - answer: "us-001", - score: expect.any(Number), - // not sure what start/end refers to in this case - start: expect.any(Number), - end: expect.any(Number), + const res = await hf.documentQuestionAnswering({ + model: "impira/layoutlm-document-qa", + inputs: { + question: "Invoice number?", + image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), + }, }); + expect(res).toBeInstanceOf(Array); + for (const elem of res) { + expect(elem).toMatchObject({ + answer: expect.any(String), + score: expect.any(Number), + }); + } }); // Errors with "Error: If you are using a VisionEncoderDecoderModel, you must provide a feature extractor" @@ -152,18 +154,20 @@ describe.concurrent("HfInference", () => { }); it("visualQuestionAnswering", async () => { - expect( - await hf.visualQuestionAnswering({ - model: "dandelin/vilt-b32-finetuned-vqa", - inputs: { - question: "How many cats are lying down?", - image: new Blob([readTestFile("cats.png")], { type: "image/png" }), - }, - }) - ).toMatchObject({ - answer: "2", - score: expect.any(Number), - }); + const res = await hf.visualQuestionAnswering({ + model: "dandelin/vilt-b32-finetuned-vqa", + inputs: { + question: "How many cats are lying down?", + image: new Blob([readTestFile("cats.png")], { type: "image/png" }), + }, + } satisfies 
VisualQuestionAnsweringInput); + expect(res).toBeInstanceOf(Array); + for (const elem of res) { + expect(elem).toMatchObject({ + answer: expect.any(String), + score: expect.any(Number), + }); + } }); it("textClassification", async () => { @@ -451,7 +455,9 @@ describe.concurrent("HfInference", () => { model: "espnet/kan-bayashi_ljspeech_vits", inputs: "hello there!", }) - ).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + ).toMatchObject({ + audio: expect.any(Blob), + }); }); it("imageClassification", async () => { @@ -473,7 +479,7 @@ describe.concurrent("HfInference", () => { it("zeroShotImageClassification", async () => { expect( await hf.zeroShotImageClassification({ - inputs: { image: new Blob([readTestFile("cheetah.png")], { type: "image/png" }) }, + inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), model: "openai/clip-vit-large-patch14-336", parameters: { candidate_labels: ["animal", "toy", "car"], diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts index 50ea306b38..4515de301a 100644 --- a/packages/tasks/src/tasks/text-to-speech/inference.ts +++ b/packages/tasks/src/tasks/text-to-speech/inference.ts @@ -1,3 +1,17 @@ +/** + * Outputs of inference for the Text To Speech task + */ +export interface TextToSpeechOutput { + /** + * The generated audio + */ + audio: Blob; + /** + * The sampling rate of the generated audio waveform. + */ + sampling_rate?: number; + [property: string]: unknown; +} /** * Inference code generated from the JSON schema spec in ./spec * @@ -117,19 +131,3 @@ export interface GenerationParameters { * Controls the stopping condition for beam-based methods. */ export type EarlyStoppingUnion = boolean | "never"; -/** - * Outputs for Text to Speech inference - * - * Outputs of inference for the Text To Audio task - */ -export interface TextToSpeechOutput { - /** - * The generated audio waveform. - */ - audio: unknown; - /** - * The sampling rate of the generated audio waveform. - */ - sampling_rate: number; - [property: string]: unknown; -} diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json index 91654e2b50..4836ed246f 100644 --- a/packages/tasks/src/tasks/text-to-speech/spec/output.json +++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json @@ -1,7 +1,18 @@ { - "$ref": "/inference/schemas/text-to-audio/output.json", "$id": "/inference/schemas/text-to-speech/output.json", "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Text To Speech task", "title": "TextToSpeechOutput", - "description": "Outputs for Text to Speech inference" + "type": "object", + "properties": { + "audio": { + "description": "The generated audio", + "comment": "type=binary" + }, + "sampling_rate": { + "type": "number", + "description": "The sampling rate of the generated audio waveform." + } + }, + "required": ["audio"] } diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts index 8b774988a3..3eb292c62d 100644 --- a/packages/tasks/src/tasks/visual-question-answering/inference.ts +++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts @@ -28,7 +28,7 @@ export interface VisualQuestionAnsweringInputData { /** * The question to answer based on the image. 
*/ - question: unknown; + question: string; [property: string]: unknown; } /** From 8741cd3ab917f42b8fbf04941dd963b20345296b Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 19:14:09 +0100 Subject: [PATCH 08/23] wip --- .../inference/src/lib/makeRequestOptions.ts | 8 +- .../src/tasks/audio/audioClassification.ts | 7 +- .../inference/src/tasks/audio/audioToAudio.ts | 75 +++-- .../tasks/audio/automaticSpeechRecognition.ts | 8 +- .../src/tasks/cv/imageClassification.ts | 7 +- .../src/tasks/cv/imageSegmentation.ts | 7 +- .../inference/src/tasks/cv/imageToText.ts | 7 +- .../inference/src/tasks/cv/objectDetection.ts | 7 +- .../src/tasks/nlp/sentenceSimilarity.ts | 11 +- packages/inference/test/HfInference.spec.ts | 263 ++++++++++------ packages/inference/test/tapes.json | 286 ++++++++++++++++++ .../tasks-gen/scripts/inference-codegen.ts | 6 +- .../src/tasks/image-to-image/inference.ts | 4 +- .../src/tasks/image-to-image/spec/input.json | 7 +- .../src/tasks/text-to-image/inference.ts | 4 +- .../src/tasks/text-to-image/spec/input.json | 7 +- 16 files changed, 564 insertions(+), 150 deletions(-) diff --git a/packages/inference/src/lib/makeRequestOptions.ts b/packages/inference/src/lib/makeRequestOptions.ts index d6a28cd1fb..12daac2499 100644 --- a/packages/inference/src/lib/makeRequestOptions.ts +++ b/packages/inference/src/lib/makeRequestOptions.ts @@ -20,10 +20,7 @@ let tasks: Record | null = null; * Helper that prepares request arguments */ export async function makeRequestOptions( - args: RequestArgs & { - data?: Blob | ArrayBuffer; - stream?: boolean; - }, + args: RequestArgs, options?: Options & { /** When a model can be used for multiple tasks, and we want to run a non-default task */ forceTask?: string | InferenceTask; @@ -41,9 +38,6 @@ export async function makeRequestOptions( if (endpointUrl && provider !== "hf-inference") { throw new Error(`Cannot use endpointUrl with a third-party provider.`); } - if (forceTask && provider !== "hf-inference") { - throw new Error(`Cannot use forceTask with a third-party provider.`); - } if (maybeModel && isUrl(maybeModel)) { throw new Error(`Model URLs are no longer supported. 
Use endpointUrl instead.`); } diff --git a/packages/inference/src/tasks/audio/audioClassification.ts b/packages/inference/src/tasks/audio/audioClassification.ts index 5525be8b81..3bf02dafe5 100644 --- a/packages/inference/src/tasks/audio/audioClassification.ts +++ b/packages/inference/src/tasks/audio/audioClassification.ts @@ -2,6 +2,7 @@ import type { AudioClassificationInput, AudioClassificationOutput } from "@huggi import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; +import { omit } from "../../utils/omit"; export type AudioClassificationArgs = BaseArgs & AudioClassificationInput; @@ -13,7 +14,11 @@ export async function audioClassification( args: AudioClassificationArgs, options?: Options ): Promise { - const res = await request(args, { + const payload = { + ...omit(args, "inputs"), + data: args.inputs, + } + const res = await request(payload, { ...options, taskHint: "audio-classification", }); diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts index c339cdf61a..d80e9d8052 100644 --- a/packages/inference/src/tasks/audio/audioToAudio.ts +++ b/packages/inference/src/tasks/audio/audioToAudio.ts @@ -1,15 +1,16 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; +import { omit } from "../../utils/omit"; import { request } from "../custom/request"; export type AudioToAudioArgs = BaseArgs & { /** * Binary audio data */ - data: Blob | ArrayBuffer; + inputs: Blob; }; -export interface AudioToAudioOutputValue { +export interface AudioToAudioOutputElem { /** * The label for the audio output (model specific) */ @@ -18,32 +19,70 @@ export interface AudioToAudioOutputValue { /** * Base64 encoded audio output. */ - blob: string; + audio: Blob; +} - /** - * Content-type for blob, e.g. audio/flac - */ +export type AudioToAudioOutput = AudioToAudioOutputElem[]; + +interface LegacyOutput { + blob: string; "content-type": string; + label: string; } -export type AudioToAudioReturn = AudioToAudioOutputValue[]; - /** * This task reads some audio input and outputs one or multiple audio files. * Example model: speechbrain/sepformer-wham does audio source separation. 
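 *
 * A minimal usage sketch, mirroring this PR's test suite (audioBytes is assumed to hold the
 * raw bytes of a flac file; accessToken and other BaseArgs fields are omitted):
 *
 * @example
 * const output = await audioToAudio({
 *   model: "speechbrain/sepformer-wham",
 *   inputs: new Blob([audioBytes], { type: "audio/flac" }),
 * });
 * // output: Array<{ label: string, audio: Blob }>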
*/ -export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise { - const res = await request(args, { +export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise { + const payload = { + ...omit(args, "inputs"), + data: args.inputs, + }; + const res = await request(payload, { ...options, taskHint: "audio-to-audio", }); - const isValidOutput = - Array.isArray(res) && - res.every( - (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string" - ); - if (!isValidOutput) { - throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>"); + + return validateOutput(res); +} + +function validateOutput(output: unknown): AudioToAudioOutput { + if (!Array.isArray(output)) { + throw new InferenceOutputError("Expected Array"); + } + if ( + output.every((elem): elem is AudioToAudioOutputElem => { + return ( + typeof elem === "object" && + elem && + "label" in elem && + typeof elem.label === "string" && + "audio" in elem && + elem.audio instanceof Blob + ); + }) + ) { + return output; + } + if ( + output.every((elem): elem is LegacyOutput => { + return ( + typeof elem === "object" && + elem && + "label" in elem && + typeof elem.label === "string" && + "content-type" in elem && + typeof elem["content-type"] === "string" && + "blob" in elem && + typeof elem.blob === "string" + ); + }) + ) { + return output.map((elem) => ({ + label: elem.label, + audio: new Blob([elem.blob], { type: elem["content-type"] }), + })); } - return res; + throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>"); } diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index 9becd0937b..a98527323c 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -3,6 +3,7 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import { request } from "../custom/request"; +import { omit } from "../../utils/omit"; export type AutomaticSpeechRecognitionArgs = BaseArgs & AutomaticSpeechRecognitionInput; /** @@ -25,9 +26,12 @@ export async function automaticSpeechRecognition( const base64audio = base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())); (args as AutomaticSpeechRecognitionArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`; - delete (args as Omit & { inputs?: unknown }).inputs; } - const res = await request(args, { + const payload = { + ...omit(args, "inputs"), + ...(args.provider !== "fal-ai" ? 
{ data: args.inputs } : undefined) + } + const res = await request(payload as AutomaticSpeechRecognitionArgs, { ...options, taskHint: "automatic-speech-recognition", }); diff --git a/packages/inference/src/tasks/cv/imageClassification.ts b/packages/inference/src/tasks/cv/imageClassification.ts index 41bfbdc20d..489cf866cf 100644 --- a/packages/inference/src/tasks/cv/imageClassification.ts +++ b/packages/inference/src/tasks/cv/imageClassification.ts @@ -2,6 +2,7 @@ import type { ImageClassificationInput, ImageClassificationOutput } from "@huggi import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; +import { omit } from "../../utils/omit"; export type ImageClassificationArgs = BaseArgs & ImageClassificationInput; @@ -13,7 +14,11 @@ export async function imageClassification( args: ImageClassificationArgs, options?: Options ): Promise { - const res = await request(args, { + const payload = { + ...omit(args, "inputs"), + data: args.inputs, + } + const res = await request(payload, { ...options, taskHint: "image-classification", }); diff --git a/packages/inference/src/tasks/cv/imageSegmentation.ts b/packages/inference/src/tasks/cv/imageSegmentation.ts index 2b19d19e0d..772bf764d9 100644 --- a/packages/inference/src/tasks/cv/imageSegmentation.ts +++ b/packages/inference/src/tasks/cv/imageSegmentation.ts @@ -2,6 +2,7 @@ import type { ImageSegmentationInput, ImageSegmentationOutput } from "@huggingfa import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; +import { omit } from "../../utils/omit"; export type ImageSegmentationArgs = BaseArgs & ImageSegmentationInput; @@ -13,7 +14,11 @@ export async function imageSegmentation( args: ImageSegmentationArgs, options?: Options ): Promise { - const res = await request(args, { + const payload = { + ...omit(args, "inputs"), + data: args.inputs + } + const res = await request(payload, { ...options, taskHint: "image-segmentation", }); diff --git a/packages/inference/src/tasks/cv/imageToText.ts b/packages/inference/src/tasks/cv/imageToText.ts index 584142e9fd..9f8e1f21df 100644 --- a/packages/inference/src/tasks/cv/imageToText.ts +++ b/packages/inference/src/tasks/cv/imageToText.ts @@ -2,14 +2,19 @@ import type { ImageToTextInput, ImageToTextOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; +import { omit } from "../../utils/omit"; export type ImageToTextArgs = BaseArgs & ImageToTextInput; /** * This task reads some image input and outputs the text caption. 
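 *
 * A hedged usage sketch, based on this PR's imageToText test (imageBytes is assumed to be
 * the raw bytes of a PNG file; accessToken and other BaseArgs fields are omitted):
 *
 * @example
 * const { generated_text } = await imageToText({
 *   model: "nlpconnect/vit-gpt2-image-captioning",
 *   inputs: new Blob([imageBytes], { type: "image/png" }),
 * });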
*/ export async function imageToText(args: ImageToTextArgs, options?: Options): Promise { + const payload = { + ...omit(args, "inputs"), + data: args.inputs + } const res = ( - await request<[ImageToTextOutput]>(args, { + await request<[ImageToTextOutput]>(payload, { ...options, taskHint: "image-to-text", }) diff --git a/packages/inference/src/tasks/cv/objectDetection.ts b/packages/inference/src/tasks/cv/objectDetection.ts index aed0102b1c..f8da40f292 100644 --- a/packages/inference/src/tasks/cv/objectDetection.ts +++ b/packages/inference/src/tasks/cv/objectDetection.ts @@ -2,6 +2,7 @@ import { request } from "../custom/request"; import type { BaseArgs, Options } from "../../types"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { ObjectDetectionInput, ObjectDetectionOutput } from "@huggingface/tasks"; +import { omit } from "../../utils/omit"; export type ObjectDetectionArgs = BaseArgs & ObjectDetectionInput; @@ -10,7 +11,11 @@ export type ObjectDetectionArgs = BaseArgs & ObjectDetectionInput; * Recommended model: facebook/detr-resnet-50 */ export async function objectDetection(args: ObjectDetectionArgs, options?: Options): Promise { - const res = await request(args, { + const payload = { + ...omit(args, "inputs"), + data: args.inputs, + } + const res = await request(payload, { ...options, taskHint: "object-detection", }); diff --git a/packages/inference/src/tasks/nlp/sentenceSimilarity.ts b/packages/inference/src/tasks/nlp/sentenceSimilarity.ts index 7cd423fc45..efca0bc52c 100644 --- a/packages/inference/src/tasks/nlp/sentenceSimilarity.ts +++ b/packages/inference/src/tasks/nlp/sentenceSimilarity.ts @@ -3,6 +3,7 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError"; import { getDefaultTask } from "../../lib/getDefaultTask"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; +import { omit } from "../../utils/omit"; export type SentenceSimilarityArgs = BaseArgs & SentenceSimilarityInput; @@ -14,7 +15,7 @@ export async function sentenceSimilarity( options?: Options ): Promise { const defaultTask = args.model ? 
await getDefaultTask(args.model, args.accessToken, options) : undefined; - const res = await request(args, { + const res = await request(prepareInput(args), { ...options, taskHint: "sentence-similarity", ...(defaultTask === "feature-extraction" && { forceTask: "sentence-similarity" }), @@ -26,3 +27,11 @@ export async function sentenceSimilarity( } return res; } + + +function prepareInput(args: SentenceSimilarityArgs) { + return { + ...omit(args, "inputs"), + inputs: { ...args.inputs, source_sentence: args.inputs.sourceSentence } + } +} \ No newline at end of file diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index dfda4609fa..40c2ad8ba6 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -2,7 +2,38 @@ import { expect, it, describe, assert } from "vitest"; import type { ChatCompletionStreamOutput, VisualQuestionAnsweringInput } from "@huggingface/tasks"; -import { chatCompletion, HfInference } from "../src"; +import { + audioClassification, + audioToAudio, + automaticSpeechRecognition, + chatCompletion, + chatCompletionStream, + documentQuestionAnswering, + featureExtraction, + fillMask, + HfInference, + imageClassification, + imageToImage, + imageToText, + objectDetection, + questionAnswering, + request, + sentenceSimilarity, + summarization, + tableQuestionAnswering, + tabularClassification, + tabularRegression, + textClassification, + textGeneration, + textGenerationStream, + textToImage, + textToSpeech, + tokenClassification, + translation, + visualQuestionAnswering, + zeroShotClassification, + zeroShotImageClassification, +} from "../src"; import "./vcr"; import { readTestFile } from "./test-files"; @@ -19,21 +50,22 @@ describe.concurrent("HfInference", () => { describe.concurrent( "HF Inference", () => { - const hf = new HfInference(env.HF_TOKEN); it("throws error if model does not exist", () => { expect( - hf.fillMask({ + fillMask({ model: "this-model-does-not-exist-123", inputs: "[MASK] world!", + accessToken: env.HF_TOKEN, }) ).rejects.toThrowError("Model this-model-does-not-exist-123 does not exist"); }); it("fillMask", async () => { expect( - await hf.fillMask({ + await fillMask({ model: "bert-base-uncased", inputs: "[MASK] world!", + accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -49,8 +81,9 @@ describe.concurrent("HfInference", () => { it("works without model", async () => { expect( - await hf.fillMask({ + await fillMask({ inputs: "[MASK] world!", + accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -66,13 +99,14 @@ describe.concurrent("HfInference", () => { it("summarization", async () => { expect( - await hf.summarization({ + await summarization({ model: "google/pegasus-xsum", inputs: "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. 
During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930.", parameters: { max_length: 100, }, + accessToken: env.HF_TOKEN, }) ).toEqual({ summary_text: "The Eiffel Tower is one of the most famous buildings in the world.", @@ -80,12 +114,13 @@ describe.concurrent("HfInference", () => { }); it("questionAnswering", async () => { - const res = await hf.questionAnswering({ + const res = await questionAnswering({ model: "deepset/roberta-base-squad2", inputs: { question: "What is the capital of France?", context: "The capital of France is Paris.", }, + accessToken: env.HF_TOKEN, }); expect(res).toMatchObject([ { @@ -99,10 +134,10 @@ describe.concurrent("HfInference", () => { it("tableQuestionAnswering", async () => { expect( - await hf.tableQuestionAnswering({ + await tableQuestionAnswering({ model: "google/tapas-base-finetuned-wtq", inputs: { - query: "How many stars does the transformers repository have?", + question: "How many stars does the transformers repository have?", table: { Repository: ["Transformers", "Datasets", "Tokenizers"], Stars: ["36542", "4512", "3934"], @@ -110,6 +145,7 @@ describe.concurrent("HfInference", () => { "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], }, }, + accessToken: env.HF_TOKEN, }) ).toMatchObject([ { @@ -122,12 +158,13 @@ describe.concurrent("HfInference", () => { }); it("documentQuestionAnswering", async () => { - const res = await hf.documentQuestionAnswering({ + const res = await documentQuestionAnswering({ model: "impira/layoutlm-document-qa", inputs: { question: "Invoice number?", image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), }, + accessToken: env.HF_TOKEN, }); expect(res).toBeInstanceOf(Array); for (const elem of res) { @@ -141,12 +178,13 @@ describe.concurrent("HfInference", () => { // Errors with "Error: If you are using a VisionEncoderDecoderModel, you must provide a feature extractor" it.skip("documentQuestionAnswering with non-array output", async () => { expect( - await hf.documentQuestionAnswering({ + await documentQuestionAnswering({ model: "naver-clova-ix/donut-base-finetuned-docvqa", inputs: { question: "Invoice number?", image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), }, + accessToken: env.HF_TOKEN, }) ).toMatchObject({ answer: "us-001", @@ -154,12 +192,13 @@ describe.concurrent("HfInference", () => { }); it("visualQuestionAnswering", async () => { - const res = await hf.visualQuestionAnswering({ + const res = await visualQuestionAnswering({ model: "dandelin/vilt-b32-finetuned-vqa", inputs: { question: "How many cats are lying down?", image: new Blob([readTestFile("cats.png")], { type: "image/png" }), }, + accessToken: env.HF_TOKEN, } satisfies VisualQuestionAnsweringInput); expect(res).toBeInstanceOf(Array); for (const elem of res) { @@ -172,9 +211,10 @@ describe.concurrent("HfInference", () => { it("textClassification", async () => { expect( - await hf.textClassification({ + await textClassification({ model: "distilbert-base-uncased-finetuned-sst-2-english", inputs: "I like you. 
I love you.", + accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -188,7 +228,8 @@ describe.concurrent("HfInference", () => { it("textGeneration - gpt2", async () => { expect( - await hf.textGeneration({ + await textGeneration({ + accessToken: env.HF_TOKEN, model: "gpt2", inputs: "The answer to the universe is", }) @@ -199,7 +240,8 @@ describe.concurrent("HfInference", () => { it("textGeneration - openai-community/gpt2", async () => { expect( - await hf.textGeneration({ + await textGeneration({ + accessToken: env.HF_TOKEN, model: "openai-community/gpt2", inputs: "The answer to the universe is", }) @@ -209,7 +251,8 @@ describe.concurrent("HfInference", () => { }); it("textGenerationStream - meta-llama/Llama-3.2-3B", async () => { - const response = hf.textGenerationStream({ + const response = textGenerationStream({ + accessToken: env.HF_TOKEN, model: "meta-llama/Llama-3.2-3B", inputs: "Please answer the following question: complete one two and ____.", parameters: { @@ -236,7 +279,8 @@ describe.concurrent("HfInference", () => { }); it("textGenerationStream - catch error", async () => { - const response = hf.textGenerationStream({ + const response = textGenerationStream({ + accessToken: env.HF_TOKEN, model: "meta-llama/Llama-3.2-3B", inputs: "Write a short story about a robot that becomes sentient and takes over the world.", parameters: { @@ -251,8 +295,9 @@ describe.concurrent("HfInference", () => { it.skip("textGenerationStream - Abort", async () => { const controller = new AbortController(); - const response = hf.textGenerationStream( + const response = textGenerationStream( { + accessToken: env.HF_TOKEN, model: "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5", inputs: "Write an essay about Sartre's philosophy.", parameters: { @@ -269,7 +314,8 @@ describe.concurrent("HfInference", () => { it("tokenClassification", async () => { expect( - await hf.tokenClassification({ + await tokenClassification({ + accessToken: env.HF_TOKEN, model: "dbmdz/bert-large-cased-finetuned-conll03-english", inputs: "My name is Sarah Jessica Parker but you can call me Jessica", }) @@ -288,36 +334,23 @@ describe.concurrent("HfInference", () => { it("translation", async () => { expect( - await hf.translation({ + await translation({ + accessToken: env.HF_TOKEN, model: "t5-base", inputs: "My name is Wolfgang and I live in Berlin", }) ).toMatchObject({ translation_text: "Mein Name ist Wolfgang und ich lebe in Berlin", }); - // input is a list - expect( - await hf.translation({ - model: "t5-base", - inputs: ["My name is Wolfgang and I live in Berlin", "I work as programmer"], - }) - ).toMatchObject([ - { - translation_text: "Mein Name ist Wolfgang und ich lebe in Berlin", - }, - { - translation_text: "Ich arbeite als Programmierer", - }, - ]); }); it("zeroShotClassification", async () => { expect( - await hf.zeroShotClassification({ + await zeroShotClassification({ model: "facebook/bart-large-mnli", - inputs: [ + inputs: "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!", - ], parameters: { candidate_labels: ["refund", "legal", "faq"] }, + accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -334,26 +367,29 @@ describe.concurrent("HfInference", () => { ]) ); }); - it("SentenceSimilarity", async () => { + it("sentenceSimilarity", async () => { expect( - await hf.sentenceSimilarity({ + await sentenceSimilarity({ + accessToken: env.HF_TOKEN, model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1", inputs: { - 
source_sentence: "That is a happy person", + sourceSentence: "That is a happy person", sentences: ["That is a happy dog", "That is a very happy person", "Today is a sunny day"], }, }) ).toEqual([expect.any(Number), expect.any(Number), expect.any(Number)]); }); it("FeatureExtraction", async () => { - const response = await hf.featureExtraction({ + const response = await featureExtraction({ + accessToken: env.HF_TOKEN, model: "sentence-transformers/distilbert-base-nli-mean-tokens", inputs: "That is a happy person", }); expect(response).toEqual(expect.arrayContaining([expect.any(Number)])); }); it("FeatureExtraction - same model as sentence similarity", async () => { - const response = await hf.featureExtraction({ + const response = await featureExtraction({ + accessToken: env.HF_TOKEN, model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1", inputs: "That is a happy person", }); @@ -362,7 +398,8 @@ describe.concurrent("HfInference", () => { expect(response).toEqual(expect.arrayContaining([expect.any(Number)])); }); it("FeatureExtraction - facebook/bart-base", async () => { - const response = await hf.featureExtraction({ + const response = await featureExtraction({ + accessToken: env.HF_TOKEN, model: "facebook/bart-base", inputs: "That is a happy person", }); @@ -380,7 +417,8 @@ describe.concurrent("HfInference", () => { ]); }); it("FeatureExtraction - facebook/bart-base, list input", async () => { - const response = await hf.featureExtraction({ + const response = await featureExtraction({ + accessToken: env.HF_TOKEN, model: "facebook/bart-base", inputs: ["hello", "That is a happy person"], }); @@ -408,9 +446,10 @@ describe.concurrent("HfInference", () => { }); it("automaticSpeechRecognition", async () => { expect( - await hf.automaticSpeechRecognition({ + await automaticSpeechRecognition({ + accessToken: env.HF_TOKEN, model: "facebook/wav2vec2-large-960h-lv60-self", - data: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), + inputs: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), }) ).toMatchObject({ text: "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES IN DRAUGHTY SCHOOLROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS", @@ -418,9 +457,10 @@ describe.concurrent("HfInference", () => { }); it("audioClassification", async () => { expect( - await hf.audioClassification({ + await audioClassification({ model: "superb/hubert-large-superb-er", - data: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), + inputs: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), + accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -434,16 +474,20 @@ describe.concurrent("HfInference", () => { it("audioToAudio", async () => { expect( - await hf.audioToAudio({ + await audioToAudio({ model: "speechbrain/sepformer-wham", - data: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), + accessToken: env.HF_TOKEN, + inputs: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), }) ).toEqual( expect.arrayContaining([ expect.objectContaining({ label: expect.any(String), - blob: expect.any(String), - "content-type": expect.any(String), + audio: expect.any(Blob), + }), + expect.objectContaining({ + label: expect.any(String), + audio: expect.any(Blob), }), ]) ); @@ -451,7 +495,8 @@ describe.concurrent("HfInference", () => { it("textToSpeech", async () => { expect( - await hf.textToSpeech({ + 
await textToSpeech({ + accessToken: env.HF_TOKEN, model: "espnet/kan-bayashi_ljspeech_vits", inputs: "hello there!", }) @@ -462,8 +507,9 @@ describe.concurrent("HfInference", () => { it("imageClassification", async () => { expect( - await hf.imageClassification({ - data: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), + await imageClassification({ + accessToken: env.HF_TOKEN, + inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), model: "google/vit-base-patch16-224", }) ).toEqual( @@ -478,7 +524,8 @@ describe.concurrent("HfInference", () => { it("zeroShotImageClassification", async () => { expect( - await hf.zeroShotImageClassification({ + await zeroShotImageClassification({ + accessToken: env.HF_TOKEN, inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), model: "openai/clip-vit-large-patch14-336", parameters: { @@ -503,8 +550,9 @@ describe.concurrent("HfInference", () => { it("objectDetection", async () => { expect( - await hf.imageClassification({ - data: new Blob([readTestFile("cats.png")], { type: "image/png" }), + await objectDetection({ + accessToken: env.HF_TOKEN, + inputs: new Blob([readTestFile("cats.png")], { type: "image/png" }), model: "facebook/detr-resnet-50", }) ).toEqual( @@ -524,8 +572,9 @@ describe.concurrent("HfInference", () => { }); it("imageSegmentation", async () => { expect( - await hf.imageClassification({ - data: new Blob([readTestFile("cats.png")], { type: "image/png" }), + await imageClassification({ + accessToken: env.HF_TOKEN, + inputs: new Blob([readTestFile("cats.png")], { type: "image/png" }), model: "facebook/detr-resnet-50-panoptic", }) ).toEqual( @@ -541,8 +590,9 @@ describe.concurrent("HfInference", () => { it("imageToImage", async () => { const num_inference_steps = 25; - const res = await hf.imageToImage({ - inputs: new Blob([readTestFile("stormtrooper_depth.png")], { type: "image / png" }), + const res = await imageToImage({ + accessToken: env.HF_TOKEN, + inputs: new Blob([readTestFile("stormtrooper_depth.png")], { type: "image/png" }), parameters: { prompt: "elmo's lecture", num_inference_steps, @@ -552,14 +602,16 @@ describe.concurrent("HfInference", () => { expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("imageToImage blob data", async () => { - const res = await hf.imageToImage({ + const res = await imageToImage({ + accessToken: env.HF_TOKEN, inputs: new Blob([readTestFile("bird_canny.png")], { type: "image / png" }), model: "lllyasviel/sd-controlnet-canny", }); expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); }); it("textToImage", async () => { - const res = await hf.textToImage({ + const res = await textToImage({ + accessToken: env.HF_TOKEN, inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]", model: "stabilityai/stable-diffusion-2", @@ -572,7 +624,8 @@ describe.concurrent("HfInference", () => { const height = 128; const num_inference_steps = 10; - const res = await hf.textToImage({ + const res = await textToImage({ + accessToken: env.HF_TOKEN, inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]", model: "stabilityai/stable-diffusion-2", @@ -587,8 +640,9 @@ describe.concurrent("HfInference", () => { }); it("imageToText", async () => { expect( - await hf.imageToText({ - data: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), + 
await imageToText({ + accessToken: env.HF_TOKEN, + inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), model: "nlpconnect/vit-gpt2-image-captioning", }) ).toEqual({ @@ -597,7 +651,8 @@ describe.concurrent("HfInference", () => { }); it("request - openai-community/gpt2", async () => { expect( - await hf.request({ + await request({ + accessToken: env.HF_TOKEN, model: "openai-community/gpt2", inputs: "one plus two equals", }) @@ -611,7 +666,8 @@ describe.concurrent("HfInference", () => { // Skipped at the moment because takes forever it.skip("tabularRegression", async () => { expect( - await hf.tabularRegression({ + await tabularRegression({ + accessToken: env.HF_TOKEN, model: "scikit-learn/Fish-Weight", inputs: { data: { @@ -630,7 +686,8 @@ describe.concurrent("HfInference", () => { // Skipped at the moment because takes forever it.skip("tabularClassification", async () => { expect( - await hf.tabularClassification({ + await tabularClassification({ + accessToken: env.HF_TOKEN, model: "vvmnnnkv/wine-quality", inputs: { data: { @@ -651,16 +708,19 @@ describe.concurrent("HfInference", () => { ).toMatchObject([5, 5, 7]); }); + const hf = new HfInference(); it("endpoint - makes request to specified endpoint", async () => { const ep = hf.endpoint("https://api-inference.huggingface.co/models/openai-community/gpt2"); const { generated_text } = await ep.textGeneration({ inputs: "one plus two equals", + accessToken: env.HF_TOKEN, }); assert.include(generated_text, "three"); }); it("chatCompletion modelId - OpenAI Specs", async () => { - const res = await hf.chatCompletion({ + const res = await chatCompletion({ + accessToken: env.HF_TOKEN, model: "mistralai/Mistral-7B-Instruct-v0.2", messages: [{ role: "user", content: "Complete the this sentence with words one plus one is equal " }], max_tokens: 500, @@ -674,7 +734,8 @@ describe.concurrent("HfInference", () => { }); it("chatCompletionStream modelId - OpenAI Specs", async () => { - const stream = hf.chatCompletionStream({ + const stream = chatCompletionStream({ + accessToken: env.HF_TOKEN, model: "mistralai/Mistral-7B-Instruct-v0.2", messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }], max_tokens: 500, @@ -692,15 +753,14 @@ describe.concurrent("HfInference", () => { it("chatCompletionStream modelId Fail - OpenAI Specs", async () => { expect( - hf - .chatCompletionStream({ - model: "google/gemma-2b", - messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }], - max_tokens: 500, - temperature: 0.1, - seed: 0, - }) - .next() + chatCompletionStream({ + accessToken: env.HF_TOKEN, + model: "google/gemma-2b", + messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }], + max_tokens: 500, + temperature: 0.1, + seed: 0, + }).next() ).rejects.toThrowError( "Server google/gemma-2b does not seem to support chat completion. 
Error: Template error: template not found" ); @@ -779,10 +839,9 @@ describe.concurrent("HfInference", () => { describe.concurrent( "Fal AI", () => { - const client = new HfInference(env.HF_FAL_KEY); - it("textToImage", async () => { - const res = await client.textToImage({ + const res = await textToImage({ + accessToken: env.HF_FAL_KEY, model: "black-forest-labs/FLUX.1-schnell", provider: "fal-ai", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", @@ -791,7 +850,8 @@ describe.concurrent("HfInference", () => { }); it("speechToText", async () => { - const res = await client.automaticSpeechRecognition({ + const res = await automaticSpeechRecognition({ + accessToken: env.HF_FAL_KEY, model: "openai/whisper-large-v3", provider: "fal-ai", inputs: new Blob([readTestFile("sample2.wav")], { type: "audio/x-wav" }), @@ -807,10 +867,9 @@ describe.concurrent("HfInference", () => { describe.concurrent( "Replicate", () => { - const client = new HfInference(env.HF_REPLICATE_KEY); - it("textToImage canonical", async () => { - const res = await client.textToImage({ + const res = await textToImage({ + accessToken: env.HF_REPLICATE_KEY, model: "black-forest-labs/FLUX.1-schnell", provider: "replicate", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", @@ -819,7 +878,8 @@ describe.concurrent("HfInference", () => { }); it("textToImage versioned", async () => { - const res = await client.textToImage({ + const res = await textToImage({ + accessToken: env.HF_REPLICATE_KEY, model: "ByteDance/SDXL-Lightning", provider: "replicate", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", @@ -828,7 +888,8 @@ describe.concurrent("HfInference", () => { }); it.skip("textToSpeech versioned", async () => { - const res = await client.textToSpeech({ + const res = await textToSpeech({ + accessToken: env.HF_REPLICATE_KEY, model: "SWivid/F5-TTS", provider: "replicate", inputs: "Hello, how are you?", @@ -841,10 +902,9 @@ describe.concurrent("HfInference", () => { describe.concurrent( "SambaNova", () => { - const client = new HfInference(env.HF_SAMBANOVA_KEY); - it("chatCompletion", async () => { - const res = await client.chatCompletion({ + const res = await chatCompletion({ + accessToken: env.HF_SAMBANOVA_KEY, model: "meta-llama/Llama-3.1-8B-Instruct", provider: "sambanova", messages: [{ role: "user", content: "Complete this sentence with words, one plus one is equal " }], @@ -855,7 +915,8 @@ describe.concurrent("HfInference", () => { } }); it("chatCompletion stream", async () => { - const stream = client.chatCompletionStream({ + const stream = chatCompletionStream({ + accessToken: env.HF_SAMBANOVA_KEY, model: "meta-llama/Llama-3.1-8B-Instruct", provider: "sambanova", messages: [{ role: "user", content: "Complete the equation 1 + 1 = , just the answer" }], @@ -875,10 +936,9 @@ describe.concurrent("HfInference", () => { describe.concurrent( "Together", () => { - const client = new HfInference(env.HF_TOGETHER_KEY); - it("chatCompletion", async () => { - const res = await client.chatCompletion({ + const res = await chatCompletion({ + accessToken: env.HF_TOGETHER_KEY, model: "meta-llama/Llama-3.3-70B-Instruct", provider: "together", messages: [{ role: "user", content: "Complete this sentence with words, one plus one is equal " }], @@ -890,7 +950,8 @@ describe.concurrent("HfInference", () => { }); it("chatCompletion stream", async () => { - const stream = 
client.chatCompletionStream({ + const stream = chatCompletionStream({ + accessToken: env.HF_TOGETHER_KEY, model: "meta-llama/Llama-3.3-70B-Instruct", provider: "together", messages: [{ role: "user", content: "Complete the equation 1 + 1 = , just the answer" }], @@ -905,8 +966,9 @@ describe.concurrent("HfInference", () => { }); it("textToImage", async () => { - const res = await client.textToImage({ + const res = await textToImage({ model: "stabilityai/stable-diffusion-xl-base-1.0", + accessToken: env.HF_TOGETHER_KEY, provider: "together", inputs: "award winning high resolution photo of a giant tortoise", }); @@ -914,7 +976,8 @@ describe.concurrent("HfInference", () => { }); it("textGeneration", async () => { - const res = await client.textGeneration({ + const res = await textGeneration({ + accessToken: env.HF_TOGETHER_KEY, model: "mistralai/Mixtral-8x7B-v0.1", provider: "together", inputs: "Paris is", diff --git a/packages/inference/test/tapes.json b/packages/inference/test/tapes.json index 4f5a67e0b1..64adc4ee80 100644 --- a/packages/inference/test/tapes.json +++ b/packages/inference/test/tapes.json @@ -4283,5 +4283,291 @@ "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" } } + }, + "4cb2d48bab5adad32d2389ad9e40f94aa14a2fab6f68af31a662697e1073afcf": { + "url": "https://api-inference.huggingface.co/models/facebook/bart-large-mnli", + "init": { + "headers": { + "Content-Type": "application/json" + }, + "method": "POST", + "body": "{\"inputs\":\"Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!\",\"parameters\":{\"candidate_labels\":[\"refund\",\"legal\",\"faq\"]}}" + }, + "response": { + "body": "{\"sequence\":\"Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!\",\"labels\":[\"refund\",\"faq\",\"legal\"],\"scores\":[0.8777874112129211,0.10522667318582535,0.01698591560125351]}", + "status": 200, + "statusText": "OK", + "headers": { + "access-control-allow-credentials": "true", + "access-control-expose-headers": "x-compute-type, x-compute-time", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "407c9eae19e4763e292d1bf6f3b67ac76c0042091fbe85758d526d08f8c55eb1": { + "url": "https://api-inference.huggingface.co/models/google/vit-base-patch16-224", + "init": { + "headers": { + "Content-Type": "application/json" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":[\"Error in `inputs`: Invalid image: {}\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "63a14c4db8cb2c34ed3fabd4d65eaa9657e844368fea8990b4cbeee198db2133": { + "url": "https://api-inference.huggingface.co/models/facebook/detr-resnet-50", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":\"Please log in or use a HF access token\"}", + "status": 429, + "statusText": "Too Many Requests", + "headers": { + "access-control-allow-credentials": 
"true", + "connection": "keep-alive", + "content-type": "application/json", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "4b27a9c8870792397aee3df4876be9ebc507c3a2fc9e828e13bb81c40bf7cd84": { + "url": "https://api-inference.huggingface.co/models/speechbrain/sepformer-wham", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":\"Malformed soundfile\"}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "0388b6333e7c6df163ac27e69352400261ee87ee8a692fe80d0f8896698d6e7c": { + "url": "https://api-inference.huggingface.co/models/facebook/detr-resnet-50-panoptic", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":[\"Error in `inputs`: Invalid image: {}\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "97b2e862f687bc824468e7a93e9f87b1b8a995642d3f6f5769c06e6ccd6f5b4c": { + "url": "https://api-inference.huggingface.co/models/nlpconnect/vit-gpt2-image-captioning", + "init": { + "headers": { + "Content-Type": "application/json" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":[\"Error in `inputs`: Invalid image: {}\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "1d5c927a794ec10ea1fc7fb4e729047cf0282799c0a434bce9d929660488a0f2": { + "url": "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2", + "init": { + "headers": { + "Content-Type": "application/json" + }, + "method": "POST", + "body": "{\"inputs\":\"award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]\",\"parameters\":{\"negative_prompt\":[\"blurry\"],\"width\":512,\"height\":128,\"num_inference_steps\":10}}" + }, + "response": { + "body": "{\"error\":\"`negative_prompt` should be the same type to `prompt`, but got != .\"}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "access-control-expose-headers": "x-compute-type, x-compute-time", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "9f068aceef2b0ad23973913eb1a18782be87daf7e8345c57fb68a7eb575de20e": { + "url": "https://api-inference.huggingface.co/models/facebook/wav2vec2-large-960h-lv60-self", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": 
"true" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":[\"Error in `inputs`: Malformed soundfile\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "201e762da947b645abe67e466af800afb936ec116a51a03e9b89343faaf7bc01": { + "url": "https://api-inference.huggingface.co/models/superb/hubert-large-superb-er", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{}}" + }, + "response": { + "body": "{\"error\":[\"Error in `inputs`: Malformed soundfile\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "41d6b86bf2438eb25338b4572ad63cc23319154c9e44c23f53a079fbc48862f2": { + "url": "https://api-inference.huggingface.co/models/sentence-transformers/paraphrase-xlm-r-multilingual-v1", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{\"sourceSentence\":\"That is a happy person\",\"sentences\":[\"That is a happy dog\",\"That is a very happy person\",\"Today is a sunny day\"]}}" + }, + "response": { + "body": "{\"error\":[\"Field required: received `source_sentence` in `parameters`\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "80ef172601f7da76a98276d2a1af8b603caf29350fc981b3de7d327c87e0126c": { + "url": "https://api-inference.huggingface.co/models/google/tapas-base-finetuned-wtq", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{\"question\":\"How many stars does the transformers repository have?\",\"table\":{\"Repository\":[\"Transformers\",\"Datasets\",\"Tokenizers\"],\"Stars\":[\"36542\",\"4512\",\"3934\"],\"Contributors\":[\"651\",\"77\",\"34\"],\"Programming language\":[\"Python\",\"Python\",\"Rust, Python and NodeJS\"]}}}" + }, + "response": { + "body": "{\"error\":\"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but is )\",\"warnings\":[\"There was an inference error: Invalid input. 
Keyword argument `table` should be either of type `dict` or `list`, but is )\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "access-control-expose-headers": "x-compute-type, x-compute-time", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } + }, + "cc7951fc493e082384f8839058a329eabaa7a13830bdf1eb03d87bc73b3fc327": { + "url": "https://api-inference.huggingface.co/models/sentence-transformers/paraphrase-xlm-r-multilingual-v1", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{\"sourceSentence\":\"That is a happy person\",\"sentences\":[\"That is a happy dog\",\"That is a very happy person\",\"Today is a sunny day\"],\"source_sentence\":\"That is a happy person\"}}" + }, + "response": { + "body": "[0.6623499989509583,0.9382342100143433,0.2296333760023117]", + "status": 200, + "statusText": "OK", + "headers": { + "access-control-allow-credentials": "true", + "access-control-expose-headers": "x-compute-type, x-compute-time", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } } } \ No newline at end of file diff --git a/packages/tasks-gen/scripts/inference-codegen.ts b/packages/tasks-gen/scripts/inference-codegen.ts index ae1ac72b11..2273a4cd6d 100644 --- a/packages/tasks-gen/scripts/inference-codegen.ts +++ b/packages/tasks-gen/scripts/inference-codegen.ts @@ -95,8 +95,8 @@ async function generatePython(inputData: InputData): Promise Date: Mon, 20 Jan 2025 19:26:53 +0100 Subject: [PATCH 09/23] wip --- .../inference/src/lib/makeRequestOptions.ts | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/packages/inference/src/lib/makeRequestOptions.ts b/packages/inference/src/lib/makeRequestOptions.ts index ea607b197e..1643ac9c9a 100644 --- a/packages/inference/src/lib/makeRequestOptions.ts +++ b/packages/inference/src/lib/makeRequestOptions.ts @@ -61,21 +61,21 @@ export async function makeRequestOptions( ? "hf-token" : "provider-key" : includeCredentials === "include" - ? "credentials-include" - : "none"; + ? "credentials-include" + : "none"; const url = endpointUrl ? chatCompletion ? endpointUrl + `/v1/chat/completions` : endpointUrl : makeUrl({ - authMethod, - chatCompletion: chatCompletion ?? false, - forceTask, - model, - provider: provider ?? "hf-inference", - taskHint, - }); + authMethod, + chatCompletion: chatCompletion ?? false, + forceTask, + model, + provider: provider ?? "hf-inference", + taskHint, + }); const headers: Record = {}; if (accessToken) { @@ -83,7 +83,7 @@ export async function makeRequestOptions( provider === "fal-ai" && authMethod === "provider-key" ? `Key ${accessToken}` : `Bearer ${accessToken}`; } - const binary = "data" in args && !!args.data; + const binary = "data" in args && !!args.data && args.data instanceof Blob; if (!binary) { headers["Content-Type"] = "application/json"; @@ -127,12 +127,12 @@ export async function makeRequestOptions( const info: RequestInit = { headers, method: "POST", - body: binary + body: "data" in args && args.data instanceof Blob ? args.data : JSON.stringify({ - ...otherArgs, - ...(chatCompletion || provider === "together" ? 
{ model } : undefined), - }), + ...otherArgs, + ...(chatCompletion || provider === "together" ? { model } : undefined), + }), ...(credentials ? { credentials } : undefined), signal: options?.signal, }; From 84605e0ee13dfb68021a59814818200c068be09d Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 19:27:34 +0100 Subject: [PATCH 10/23] format --- .../inference/src/lib/makeRequestOptions.ts | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/packages/inference/src/lib/makeRequestOptions.ts b/packages/inference/src/lib/makeRequestOptions.ts index 1643ac9c9a..38e5d24af2 100644 --- a/packages/inference/src/lib/makeRequestOptions.ts +++ b/packages/inference/src/lib/makeRequestOptions.ts @@ -61,21 +61,21 @@ export async function makeRequestOptions( ? "hf-token" : "provider-key" : includeCredentials === "include" - ? "credentials-include" - : "none"; + ? "credentials-include" + : "none"; const url = endpointUrl ? chatCompletion ? endpointUrl + `/v1/chat/completions` : endpointUrl : makeUrl({ - authMethod, - chatCompletion: chatCompletion ?? false, - forceTask, - model, - provider: provider ?? "hf-inference", - taskHint, - }); + authMethod, + chatCompletion: chatCompletion ?? false, + forceTask, + model, + provider: provider ?? "hf-inference", + taskHint, + }); const headers: Record = {}; if (accessToken) { @@ -127,12 +127,13 @@ export async function makeRequestOptions( const info: RequestInit = { headers, method: "POST", - body: "data" in args && args.data instanceof Blob - ? args.data - : JSON.stringify({ - ...otherArgs, - ...(chatCompletion || provider === "together" ? { model } : undefined), - }), + body: + "data" in args && args.data instanceof Blob + ? args.data + : JSON.stringify({ + ...otherArgs, + ...(chatCompletion || provider === "together" ? { model } : undefined), + }), ...(credentials ? 
{ credentials } : undefined), signal: options?.signal, }; From 36b9cd086cccc1b2d0e833d2ddf78cabcdc9f52e Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Mon, 20 Jan 2025 19:31:36 +0100 Subject: [PATCH 11/23] test --- packages/inference/src/tasks/audio/textToSpeech.ts | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/packages/inference/src/tasks/audio/textToSpeech.ts b/packages/inference/src/tasks/audio/textToSpeech.ts index 5886588419..9b46618bc0 100644 --- a/packages/inference/src/tasks/audio/textToSpeech.ts +++ b/packages/inference/src/tasks/audio/textToSpeech.ts @@ -17,6 +17,10 @@ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): P ...options, taskHint: "text-to-speech", }); + console.log(res) + if (res instanceof Blob) { + return { audio: res } + } if (res && typeof res === "object") { if ("output" in res) { if (typeof res.output === "string") { @@ -29,13 +33,6 @@ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): P return { audio: blob }; } } - throw new InferenceOutputError("Expected Blob or object with output"); - } else { - const isValidOutput = res && res instanceof Blob; - if (!isValidOutput) { - throw new InferenceOutputError("Expected Blob"); - } } - - return { audio: res }; + throw new InferenceOutputError("Expected Blob or object with output"); } From 8c2dc24155155b8512092f30948a572fa15cd44e Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 14:52:55 +0100 Subject: [PATCH 12/23] backwards compat for most image & audio tasks --- .../src/tasks/audio/audioClassification.ts | 10 +- .../inference/src/tasks/audio/audioToAudio.ts | 10 +- .../tasks/audio/automaticSpeechRecognition.ts | 10 +- .../inference/src/tasks/audio/textToSpeech.ts | 11 +- packages/inference/src/tasks/audio/utils.ts | 16 + .../src/tasks/cv/imageClassification.ts | 9 +- .../src/tasks/cv/imageSegmentation.ts | 9 +- .../inference/src/tasks/cv/imageToImage.ts | 8 +- .../inference/src/tasks/cv/imageToText.ts | 10 +- .../inference/src/tasks/cv/objectDetection.ts | 9 +- .../inference/src/tasks/cv/textToImage.ts | 10 +- packages/inference/src/tasks/cv/utils.ts | 13 + .../tasks/cv/zeroShotImageClassification.ts | 43 +- packages/inference/test/HfInference.spec.ts | 381 ++++++++---------- packages/inference/test/tapes.json | 372 ++--------------- 15 files changed, 302 insertions(+), 619 deletions(-) create mode 100644 packages/inference/src/tasks/audio/utils.ts create mode 100644 packages/inference/src/tasks/cv/utils.ts diff --git a/packages/inference/src/tasks/audio/audioClassification.ts b/packages/inference/src/tasks/audio/audioClassification.ts index 91165a6eb3..6756db0b56 100644 --- a/packages/inference/src/tasks/audio/audioClassification.ts +++ b/packages/inference/src/tasks/audio/audioClassification.ts @@ -2,9 +2,10 @@ import type { AudioClassificationInput, AudioClassificationOutput } from "@huggi import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -import { omit } from "../../utils/omit"; +import type { LegacyAudioInput } from "./utils"; +import { preparePayload } from "./utils"; -export type AudioClassificationArgs = BaseArgs & AudioClassificationInput; +export type AudioClassificationArgs = BaseArgs & (AudioClassificationInput | LegacyAudioInput); /** * This task reads some audio input and outputs the likelihood of classes. 
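 * Recommended model (per this PR's tests): superb/hubert-large-superb-er
 *
 * @example
 * // sketch only: audioBytes is assumed to be raw flac bytes
 * const classes = await audioClassification({
 *   model: "superb/hubert-large-superb-er",
 *   inputs: new Blob([audioBytes], { type: "audio/flac" }),
 * });
 * // classes: Array<{ score: number, label: string }>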
@@ -14,10 +15,7 @@ export async function audioClassification( args: AudioClassificationArgs, options?: Options ): Promise { - const payload = { - ...omit(args, "inputs"), - data: args.inputs, - }; + const payload = preparePayload(args); const res = await request(payload, { ...options, taskHint: "audio-classification", diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts index d80e9d8052..8ae6b15de5 100644 --- a/packages/inference/src/tasks/audio/audioToAudio.ts +++ b/packages/inference/src/tasks/audio/audioToAudio.ts @@ -1,14 +1,15 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; -import { omit } from "../../utils/omit"; import { request } from "../custom/request"; +import type { LegacyAudioInput } from "./utils"; +import { preparePayload } from "./utils"; export type AudioToAudioArgs = BaseArgs & { /** * Binary audio data */ inputs: Blob; -}; +} | LegacyAudioInput; export interface AudioToAudioOutputElem { /** @@ -35,10 +36,7 @@ interface LegacyOutput { * Example model: speechbrain/sepformer-wham does audio source separation. */ export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise { - const payload = { - ...omit(args, "inputs"), - data: args.inputs, - }; + const payload = preparePayload(args); const res = await request(payload, { ...options, taskHint: "audio-to-audio", diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index 658f98e8e8..70eb4df296 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -3,9 +3,10 @@ import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import { request } from "../custom/request"; -import { omit } from "../../utils/omit"; +import type { LegacyAudioInput } from "./utils"; +import { preparePayload } from "./utils"; -export type AutomaticSpeechRecognitionArgs = BaseArgs & AutomaticSpeechRecognitionInput; +export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput); /** * This task reads some audio input and outputs the said words within the audio files. * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self @@ -14,6 +15,7 @@ export async function automaticSpeechRecognition( args: AutomaticSpeechRecognitionArgs, options?: Options ): Promise { + const payload = preparePayload(args); if (args.provider === "fal-ai") { const contentType = args.inputs.type; if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) { @@ -27,10 +29,6 @@ export async function automaticSpeechRecognition( (args as AutomaticSpeechRecognitionArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`; } - const payload = { - ...omit(args, "inputs"), - ...(args.provider !== "fal-ai" ? 
{ data: args.inputs } : undefined), - }; const res = await request(payload as AutomaticSpeechRecognitionArgs, { ...options, taskHint: "automatic-speech-recognition", diff --git a/packages/inference/src/tasks/audio/textToSpeech.ts b/packages/inference/src/tasks/audio/textToSpeech.ts index 9b46618bc0..d981ae7c81 100644 --- a/packages/inference/src/tasks/audio/textToSpeech.ts +++ b/packages/inference/src/tasks/audio/textToSpeech.ts @@ -1,4 +1,4 @@ -import type { TextToSpeechInput, TextToSpeechOutput } from "@huggingface/tasks"; +import type { TextToSpeechInput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; @@ -12,25 +12,24 @@ interface OutputUrlTextToSpeechGeneration { * This task synthesize an audio of a voice pronouncing a given text. * Recommended model: espnet/kan-bayashi_ljspeech_vits */ -export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise { +export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise { const res = await request(args, { ...options, taskHint: "text-to-speech", }); - console.log(res) if (res instanceof Blob) { - return { audio: res } + return res; } if (res && typeof res === "object") { if ("output" in res) { if (typeof res.output === "string") { const urlResponse = await fetch(res.output); const blob = await urlResponse.blob(); - return { audio: blob }; + return blob; } else if (Array.isArray(res.output)) { const urlResponse = await fetch(res.output[0]); const blob = await urlResponse.blob(); - return { audio: blob }; + return blob; } } } diff --git a/packages/inference/src/tasks/audio/utils.ts b/packages/inference/src/tasks/audio/utils.ts new file mode 100644 index 0000000000..58716efd3c --- /dev/null +++ b/packages/inference/src/tasks/audio/utils.ts @@ -0,0 +1,16 @@ +import type { BaseArgs, RequestArgs } from "../../types"; +import { omit } from "../../utils/omit"; + +/** + * @deprecated + */ +export interface LegacyAudioInput { + data: Blob | ArrayBuffer +} + +export function preparePayload(args: BaseArgs & ({ inputs: Blob } | LegacyAudioInput)): RequestArgs { + return "data" in args ? args : { + ...omit(args, "inputs"), + data: args.inputs + } +} diff --git a/packages/inference/src/tasks/cv/imageClassification.ts b/packages/inference/src/tasks/cv/imageClassification.ts index c30d3e119b..e3ab3fccb3 100644 --- a/packages/inference/src/tasks/cv/imageClassification.ts +++ b/packages/inference/src/tasks/cv/imageClassification.ts @@ -2,9 +2,9 @@ import type { ImageClassificationInput, ImageClassificationOutput } from "@huggi import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -import { omit } from "../../utils/omit"; +import { preparePayload, type LegacyImageInput } from "./utils"; -export type ImageClassificationArgs = BaseArgs & ImageClassificationInput; +export type ImageClassificationArgs = BaseArgs & (ImageClassificationInput | LegacyImageInput); /** * This task reads some image input and outputs the likelihood of classes. 
@@ -14,10 +14,7 @@ export async function imageClassification( args: ImageClassificationArgs, options?: Options ): Promise { - const payload = { - ...omit(args, "inputs"), - data: args.inputs, - }; + const payload = preparePayload(args); const res = await request(payload, { ...options, taskHint: "image-classification", diff --git a/packages/inference/src/tasks/cv/imageSegmentation.ts b/packages/inference/src/tasks/cv/imageSegmentation.ts index 8a145d9d11..3e5a8453ee 100644 --- a/packages/inference/src/tasks/cv/imageSegmentation.ts +++ b/packages/inference/src/tasks/cv/imageSegmentation.ts @@ -2,9 +2,9 @@ import type { ImageSegmentationInput, ImageSegmentationOutput } from "@huggingfa import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -import { omit } from "../../utils/omit"; +import { preparePayload, type LegacyImageInput } from "./utils"; -export type ImageSegmentationArgs = BaseArgs & ImageSegmentationInput; +export type ImageSegmentationArgs = BaseArgs & (ImageSegmentationInput | LegacyImageInput); /** * This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects. @@ -14,10 +14,7 @@ export async function imageSegmentation( args: ImageSegmentationArgs, options?: Options ): Promise { - const payload = { - ...omit(args, "inputs"), - data: args.inputs, - }; + const payload = preparePayload(args); const res = await request(payload, { ...options, taskHint: "image-segmentation", diff --git a/packages/inference/src/tasks/cv/imageToImage.ts b/packages/inference/src/tasks/cv/imageToImage.ts index b9bd49c195..7efd0a67b3 100644 --- a/packages/inference/src/tasks/cv/imageToImage.ts +++ b/packages/inference/src/tasks/cv/imageToImage.ts @@ -1,4 +1,4 @@ -import type { ImageToImageInput, ImageToImageOutput } from "@huggingface/tasks"; +import type { ImageToImageInput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options, RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; @@ -10,7 +10,7 @@ export type ImageToImageArgs = BaseArgs & ImageToImageInput; * This task reads some text input and outputs an image. 
* Recommended model: lllyasviel/sd-controlnet-depth */ -export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise { +export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise { let reqArgs: RequestArgs; if (!args.parameters) { reqArgs = { @@ -26,7 +26,7 @@ export async function imageToImage(args: ImageToImageArgs, options?: Options): P ), }; } - const res = await request(reqArgs, { + const res = await request(reqArgs, { ...options, taskHint: "image-to-image", }); @@ -34,5 +34,5 @@ export async function imageToImage(args: ImageToImageArgs, options?: Options): P if (!isValidOutput) { throw new InferenceOutputError("Expected Blob"); } - return { image: res }; + return res; } diff --git a/packages/inference/src/tasks/cv/imageToText.ts b/packages/inference/src/tasks/cv/imageToText.ts index 601fda4f8c..e7c952d286 100644 --- a/packages/inference/src/tasks/cv/imageToText.ts +++ b/packages/inference/src/tasks/cv/imageToText.ts @@ -2,17 +2,15 @@ import type { ImageToTextInput, ImageToTextOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { BaseArgs, Options } from "../../types"; import { request } from "../custom/request"; -import { omit } from "../../utils/omit"; +import type { LegacyImageInput } from "./utils"; +import { preparePayload } from "./utils"; -export type ImageToTextArgs = BaseArgs & ImageToTextInput; +export type ImageToTextArgs = BaseArgs & (ImageToTextInput | LegacyImageInput); /** * This task reads some image input and outputs the text caption. */ export async function imageToText(args: ImageToTextArgs, options?: Options): Promise { - const payload = { - ...omit(args, "inputs"), - data: args.inputs, - }; + const payload = preparePayload(args); const res = ( await request<[ImageToTextOutput]>(payload, { ...options, diff --git a/packages/inference/src/tasks/cv/objectDetection.ts b/packages/inference/src/tasks/cv/objectDetection.ts index 50f883c2e6..509c94c82d 100644 --- a/packages/inference/src/tasks/cv/objectDetection.ts +++ b/packages/inference/src/tasks/cv/objectDetection.ts @@ -2,19 +2,16 @@ import { request } from "../custom/request"; import type { BaseArgs, Options } from "../../types"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; import type { ObjectDetectionInput, ObjectDetectionOutput } from "@huggingface/tasks"; -import { omit } from "../../utils/omit"; +import { preparePayload, type LegacyImageInput } from "./utils"; -export type ObjectDetectionArgs = BaseArgs & ObjectDetectionInput; +export type ObjectDetectionArgs = BaseArgs & (ObjectDetectionInput | LegacyImageInput); /** * This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects. * Recommended model: facebook/detr-resnet-50 */ export async function objectDetection(args: ObjectDetectionArgs, options?: Options): Promise { - const payload = { - ...omit(args, "inputs"), - data: args.inputs, - }; + const payload = preparePayload(args); const res = await request(payload, { ...options, taskHint: "object-detection", diff --git a/packages/inference/src/tasks/cv/textToImage.ts b/packages/inference/src/tasks/cv/textToImage.ts index ae740fb626..495c099352 100644 --- a/packages/inference/src/tasks/cv/textToImage.ts +++ b/packages/inference/src/tasks/cv/textToImage.ts @@ -18,7 +18,7 @@ interface OutputUrlImageGeneration { * This task reads some text input and outputs an image. 
* Recommended model: stabilityai/stable-diffusion-2 */ -export async function textToImage(args: TextToImageArgs, options?: Options): Promise { +export async function textToImage(args: TextToImageArgs, options?: Options): Promise { if (args.provider === "together" || args.provider === "fal-ai") { args.prompt = args.inputs; delete (args as unknown as { inputs: unknown }).inputs; @@ -34,23 +34,23 @@ export async function textToImage(args: TextToImageArgs, options?: Options): Pro if (res && typeof res === "object") { if (args.provider === "fal-ai" && "images" in res && Array.isArray(res.images) && res.images[0].url) { const image = await fetch(res.images[0].url); - return { image: await image.blob() }; + return await image.blob(); } if ("data" in res && Array.isArray(res.data) && res.data[0].b64_json) { const base64Data = res.data[0].b64_json; const base64Response = await fetch(`data:image/jpeg;base64,${base64Data}`); const blob = await base64Response.blob(); - return { image: blob }; + return blob; } if ("output" in res && Array.isArray(res.output)) { const urlResponse = await fetch(res.output[0]); const blob = await urlResponse.blob(); - return { image: blob }; + return blob; } } const isValidOutput = res && res instanceof Blob; if (!isValidOutput) { throw new InferenceOutputError("Expected Blob"); } - return { image: res }; + return res; } diff --git a/packages/inference/src/tasks/cv/utils.ts b/packages/inference/src/tasks/cv/utils.ts new file mode 100644 index 0000000000..cab7f2a41c --- /dev/null +++ b/packages/inference/src/tasks/cv/utils.ts @@ -0,0 +1,13 @@ +import type { BaseArgs, RequestArgs } from "../../types"; +import { omit } from "../../utils/omit"; + +/** + * @deprecated + */ +export interface LegacyImageInput { + data: Blob | ArrayBuffer; +} + +export function preparePayload(args: BaseArgs & ({ inputs: Blob } | LegacyImageInput)): RequestArgs { + return "data" in args ? args : { ...omit(args, "inputs"), data: args.inputs }; +} diff --git a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts index 014cbe8126..3ee4e0d9e0 100644 --- a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts +++ b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts @@ -5,7 +5,38 @@ import type { RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import type { ZeroShotImageClassificationInput, ZeroShotImageClassificationOutput } from "@huggingface/tasks"; -export type ZeroShotImageClassificationArgs = BaseArgs & ZeroShotImageClassificationInput; +/** + * @deprecated + */ +interface LegacyZeroShotImageClassificationInput { + inputs: { image: Blob | ArrayBuffer } +} + +export type ZeroShotImageClassificationArgs = BaseArgs & (ZeroShotImageClassificationInput | LegacyZeroShotImageClassificationInput); + +async function preparePayload(args: ZeroShotImageClassificationArgs): Promise { + if (args.inputs instanceof Blob) { + return { + ...args, + inputs: { + image: base64FromBytes( + new Uint8Array(await args.inputs.arrayBuffer()) + ) + } + } + } else { + return { + ...args, + inputs: { + image: base64FromBytes( + new Uint8Array( + args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer() + ) + ) + }, + } + } +} /** * Classify an image to specified classes. 
@@ -15,14 +46,8 @@ export async function zeroShotImageClassification( args: ZeroShotImageClassificationArgs, options?: Options ): Promise { - const reqArgs: RequestArgs = { - ...args, - inputs: { - image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())), - }, - } as RequestArgs; - - const res = await request(reqArgs, { + const payload = await preparePayload(args) + const res = await request(payload, { ...options, taskHint: "zero-shot-image-classification", }); diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index 9aee7c7811..adc498451b 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -1,39 +1,8 @@ import { expect, it, describe, assert } from "vitest"; -import type { ChatCompletionStreamOutput, VisualQuestionAnsweringInput } from "@huggingface/tasks"; - -import { - audioClassification, - audioToAudio, - automaticSpeechRecognition, - chatCompletion, - chatCompletionStream, - documentQuestionAnswering, - featureExtraction, - fillMask, - HfInference, - imageClassification, - imageToImage, - imageToText, - objectDetection, - questionAnswering, - request, - sentenceSimilarity, - summarization, - tableQuestionAnswering, - tabularClassification, - tabularRegression, - textClassification, - textGeneration, - textGenerationStream, - textToImage, - textToSpeech, - tokenClassification, - translation, - visualQuestionAnswering, - zeroShotClassification, - zeroShotImageClassification, -} from "../src"; +import type { ChatCompletionStreamOutput } from "@huggingface/tasks"; + +import { chatCompletion, HfInference } from "../src"; import "./vcr"; import { readTestFile } from "./test-files"; @@ -50,22 +19,21 @@ describe.concurrent("HfInference", () => { describe.concurrent( "HF Inference", () => { + const hf = new HfInference(env.HF_TOKEN); it("throws error if model does not exist", () => { expect( - fillMask({ + hf.fillMask({ model: "this-model-does-not-exist-123", inputs: "[MASK] world!", - accessToken: env.HF_TOKEN, }) ).rejects.toThrowError("Model this-model-does-not-exist-123 does not exist"); }); it("fillMask", async () => { expect( - await fillMask({ + await hf.fillMask({ model: "bert-base-uncased", inputs: "[MASK] world!", - accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -81,9 +49,8 @@ describe.concurrent("HfInference", () => { it("works without model", async () => { expect( - await fillMask({ + await hf.fillMask({ inputs: "[MASK] world!", - accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -99,14 +66,13 @@ describe.concurrent("HfInference", () => { it("summarization", async () => { expect( - await summarization({ + await hf.summarization({ model: "google/pegasus-xsum", inputs: "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. 
During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930.", parameters: { max_length: 100, }, - accessToken: env.HF_TOKEN, }) ).toEqual({ summary_text: "The Eiffel Tower is one of the most famous buildings in the world.", @@ -114,30 +80,28 @@ describe.concurrent("HfInference", () => { }); it("questionAnswering", async () => { - const res = await questionAnswering({ - model: "deepset/roberta-base-squad2", - inputs: { - question: "What is the capital of France?", - context: "The capital of France is Paris.", - }, - accessToken: env.HF_TOKEN, + expect( + await hf.questionAnswering({ + model: "deepset/roberta-base-squad2", + inputs: { + question: "What is the capital of France?", + context: "The capital of France is Paris.", + }, + }) + ).toMatchObject({ + answer: "Paris", + score: expect.any(Number), + start: expect.any(Number), + end: expect.any(Number), }); - expect(res).toMatchObject([ - { - answer: "Paris", - score: expect.any(Number), - start: expect.any(Number), - end: expect.any(Number), - }, - ]); }); it("tableQuestionAnswering", async () => { expect( - await tableQuestionAnswering({ + await hf.tableQuestionAnswering({ model: "google/tapas-base-finetuned-wtq", inputs: { - question: "How many stars does the transformers repository have?", + query: "How many stars does the transformers repository have?", table: { Repository: ["Transformers", "Datasets", "Tokenizers"], Stars: ["36542", "4512", "3934"], @@ -145,46 +109,42 @@ describe.concurrent("HfInference", () => { "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], }, }, - accessToken: env.HF_TOKEN, }) - ).toMatchObject([ - { - answer: "AVERAGE > 36542", - coordinates: [[0, 1]], - cells: ["36542"], - aggregator: "AVERAGE", - }, - ]); + ).toMatchObject({ + answer: "AVERAGE > 36542", + coordinates: [[0, 1]], + cells: ["36542"], + aggregator: "AVERAGE", + }); }); it("documentQuestionAnswering", async () => { - const res = await documentQuestionAnswering({ - model: "impira/layoutlm-document-qa", - inputs: { - question: "Invoice number?", - image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), - }, - accessToken: env.HF_TOKEN, + expect( + await hf.documentQuestionAnswering({ + model: "impira/layoutlm-document-qa", + inputs: { + question: "Invoice number?", + image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), + }, + }) + ).toMatchObject({ + answer: "us-001", + score: expect.any(Number), + // not sure what start/end refers to in this case + start: expect.any(Number), + end: expect.any(Number), }); - expect(res).toBeInstanceOf(Array); - for (const elem of res) { - expect(elem).toMatchObject({ - answer: expect.any(String), - score: expect.any(Number), - }); - } }); // Errors with "Error: If you are using a VisionEncoderDecoderModel, you must provide a feature extractor" it.skip("documentQuestionAnswering with non-array output", async () => { expect( - await documentQuestionAnswering({ + await hf.documentQuestionAnswering({ model: "naver-clova-ix/donut-base-finetuned-docvqa", inputs: { question: "Invoice number?", image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), }, - accessToken: env.HF_TOKEN, }) ).toMatchObject({ answer: "us-001", @@ -192,29 +152,25 @@ describe.concurrent("HfInference", () => { }); it("visualQuestionAnswering", async () => { - const res = await visualQuestionAnswering({ - model: 
"dandelin/vilt-b32-finetuned-vqa", - inputs: { - question: "How many cats are lying down?", - image: new Blob([readTestFile("cats.png")], { type: "image/png" }), - }, - accessToken: env.HF_TOKEN, - } satisfies VisualQuestionAnsweringInput); - expect(res).toBeInstanceOf(Array); - for (const elem of res) { - expect(elem).toMatchObject({ - answer: expect.any(String), - score: expect.any(Number), - }); - } + expect( + await hf.visualQuestionAnswering({ + model: "dandelin/vilt-b32-finetuned-vqa", + inputs: { + question: "How many cats are lying down?", + image: new Blob([readTestFile("cats.png")], { type: "image/png" }), + }, + }) + ).toMatchObject({ + answer: "2", + score: expect.any(Number), + }); }); it("textClassification", async () => { expect( - await textClassification({ + await hf.textClassification({ model: "distilbert-base-uncased-finetuned-sst-2-english", inputs: "I like you. I love you.", - accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -228,8 +184,7 @@ describe.concurrent("HfInference", () => { it("textGeneration - gpt2", async () => { expect( - await textGeneration({ - accessToken: env.HF_TOKEN, + await hf.textGeneration({ model: "gpt2", inputs: "The answer to the universe is", }) @@ -240,8 +195,7 @@ describe.concurrent("HfInference", () => { it("textGeneration - openai-community/gpt2", async () => { expect( - await textGeneration({ - accessToken: env.HF_TOKEN, + await hf.textGeneration({ model: "openai-community/gpt2", inputs: "The answer to the universe is", }) @@ -251,8 +205,7 @@ describe.concurrent("HfInference", () => { }); it("textGenerationStream - meta-llama/Llama-3.2-3B", async () => { - const response = textGenerationStream({ - accessToken: env.HF_TOKEN, + const response = hf.textGenerationStream({ model: "meta-llama/Llama-3.2-3B", inputs: "Please answer the following question: complete one two and ____.", parameters: { @@ -279,8 +232,7 @@ describe.concurrent("HfInference", () => { }); it("textGenerationStream - catch error", async () => { - const response = textGenerationStream({ - accessToken: env.HF_TOKEN, + const response = hf.textGenerationStream({ model: "meta-llama/Llama-3.2-3B", inputs: "Write a short story about a robot that becomes sentient and takes over the world.", parameters: { @@ -295,9 +247,8 @@ describe.concurrent("HfInference", () => { it.skip("textGenerationStream - Abort", async () => { const controller = new AbortController(); - const response = textGenerationStream( + const response = hf.textGenerationStream( { - accessToken: env.HF_TOKEN, model: "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5", inputs: "Write an essay about Sartre's philosophy.", parameters: { @@ -314,8 +265,7 @@ describe.concurrent("HfInference", () => { it("tokenClassification", async () => { expect( - await tokenClassification({ - accessToken: env.HF_TOKEN, + await hf.tokenClassification({ model: "dbmdz/bert-large-cased-finetuned-conll03-english", inputs: "My name is Sarah Jessica Parker but you can call me Jessica", }) @@ -334,23 +284,36 @@ describe.concurrent("HfInference", () => { it("translation", async () => { expect( - await translation({ - accessToken: env.HF_TOKEN, + await hf.translation({ model: "t5-base", inputs: "My name is Wolfgang and I live in Berlin", }) ).toMatchObject({ translation_text: "Mein Name ist Wolfgang und ich lebe in Berlin", }); + // input is a list + expect( + await hf.translation({ + model: "t5-base", + inputs: ["My name is Wolfgang and I live in Berlin", "I work as programmer"], + }) + ).toMatchObject([ + { + translation_text: 
"Mein Name ist Wolfgang und ich lebe in Berlin", + }, + { + translation_text: "Ich arbeite als Programmierer", + }, + ]); }); it("zeroShotClassification", async () => { expect( - await zeroShotClassification({ + await hf.zeroShotClassification({ model: "facebook/bart-large-mnli", - inputs: + inputs: [ "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!", + ], parameters: { candidate_labels: ["refund", "legal", "faq"] }, - accessToken: env.HF_TOKEN, }) ).toEqual( expect.arrayContaining([ @@ -367,29 +330,26 @@ describe.concurrent("HfInference", () => { ]) ); }); - it("sentenceSimilarity", async () => { + it("SentenceSimilarity", async () => { expect( - await sentenceSimilarity({ - accessToken: env.HF_TOKEN, + await hf.sentenceSimilarity({ model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1", inputs: { - sourceSentence: "That is a happy person", + source_sentence: "That is a happy person", sentences: ["That is a happy dog", "That is a very happy person", "Today is a sunny day"], }, }) ).toEqual([expect.any(Number), expect.any(Number), expect.any(Number)]); }); it("FeatureExtraction", async () => { - const response = await featureExtraction({ - accessToken: env.HF_TOKEN, + const response = await hf.featureExtraction({ model: "sentence-transformers/distilbert-base-nli-mean-tokens", inputs: "That is a happy person", }); expect(response).toEqual(expect.arrayContaining([expect.any(Number)])); }); it("FeatureExtraction - same model as sentence similarity", async () => { - const response = await featureExtraction({ - accessToken: env.HF_TOKEN, + const response = await hf.featureExtraction({ model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1", inputs: "That is a happy person", }); @@ -398,8 +358,7 @@ describe.concurrent("HfInference", () => { expect(response).toEqual(expect.arrayContaining([expect.any(Number)])); }); it("FeatureExtraction - facebook/bart-base", async () => { - const response = await featureExtraction({ - accessToken: env.HF_TOKEN, + const response = await hf.featureExtraction({ model: "facebook/bart-base", inputs: "That is a happy person", }); @@ -417,8 +376,7 @@ describe.concurrent("HfInference", () => { ]); }); it("FeatureExtraction - facebook/bart-base, list input", async () => { - const response = await featureExtraction({ - accessToken: env.HF_TOKEN, + const response = await hf.featureExtraction({ model: "facebook/bart-base", inputs: ["hello", "That is a happy person"], }); @@ -446,10 +404,9 @@ describe.concurrent("HfInference", () => { }); it("automaticSpeechRecognition", async () => { expect( - await automaticSpeechRecognition({ - accessToken: env.HF_TOKEN, + await hf.automaticSpeechRecognition({ model: "facebook/wav2vec2-large-960h-lv60-self", - inputs: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), + data: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), }) ).toMatchObject({ text: "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES IN DRAUGHTY SCHOOLROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS", @@ -457,10 +414,9 @@ describe.concurrent("HfInference", () => { }); it("audioClassification", async () => { expect( - await audioClassification({ + await hf.audioClassification({ model: "superb/hubert-large-superb-er", - inputs: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), - accessToken: env.HF_TOKEN, + data: 
new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), }) ).toEqual( expect.arrayContaining([ @@ -474,10 +430,9 @@ describe.concurrent("HfInference", () => { it("audioToAudio", async () => { expect( - await audioToAudio({ + await hf.audioToAudio({ model: "speechbrain/sepformer-wham", - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), + data: new Blob([readTestFile("sample1.flac")], { type: "audio/flac" }), }) ).toEqual( expect.arrayContaining([ @@ -495,21 +450,17 @@ describe.concurrent("HfInference", () => { it("textToSpeech", async () => { expect( - await textToSpeech({ - accessToken: env.HF_TOKEN, + await hf.textToSpeech({ model: "espnet/kan-bayashi_ljspeech_vits", inputs: "hello there!", }) - ).toMatchObject({ - audio: expect.any(Blob), - }); + ).toBeInstanceOf(Blob); }); it("imageClassification", async () => { expect( - await imageClassification({ - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), + await hf.imageClassification({ + data: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), model: "google/vit-base-patch16-224", }) ).toEqual( @@ -524,9 +475,8 @@ describe.concurrent("HfInference", () => { it("zeroShotImageClassification", async () => { expect( - await zeroShotImageClassification({ - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), + await hf.zeroShotImageClassification({ + inputs: { image: new Blob([readTestFile("cheetah.png")], { type: "image/png" }) }, model: "openai/clip-vit-large-patch14-336", parameters: { candidate_labels: ["animal", "toy", "car"], @@ -550,9 +500,8 @@ describe.concurrent("HfInference", () => { it("objectDetection", async () => { expect( - await objectDetection({ - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("cats.png")], { type: "image/png" }), + await hf.imageClassification({ + data: new Blob([readTestFile("cats.png")], { type: "image/png" }), model: "facebook/detr-resnet-50", }) ).toEqual( @@ -572,9 +521,8 @@ describe.concurrent("HfInference", () => { }); it("imageSegmentation", async () => { expect( - await imageClassification({ - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("cats.png")], { type: "image/png" }), + await hf.imageClassification({ + data: new Blob([readTestFile("cats.png")], { type: "image/png" }), model: "facebook/detr-resnet-50-panoptic", }) ).toEqual( @@ -590,33 +538,30 @@ describe.concurrent("HfInference", () => { it("imageToImage", async () => { const num_inference_steps = 25; - const res = await imageToImage({ - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("stormtrooper_depth.png")], { type: "image/png" }), + const res = await hf.imageToImage({ + inputs: new Blob([readTestFile("stormtrooper_depth.png")], { type: "image / png" }), parameters: { prompt: "elmo's lecture", num_inference_steps, }, model: "lllyasviel/sd-controlnet-depth", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it("imageToImage blob data", async () => { - const res = await imageToImage({ - accessToken: env.HF_TOKEN, + const res = await hf.imageToImage({ inputs: new Blob([readTestFile("bird_canny.png")], { type: "image / png" }), model: "lllyasviel/sd-controlnet-canny", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); 
it("textToImage", async () => { - const res = await textToImage({ - accessToken: env.HF_TOKEN, + const res = await hf.textToImage({ inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]", model: "stabilityai/stable-diffusion-2", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it("textToImage with parameters", async () => { @@ -624,8 +569,7 @@ describe.concurrent("HfInference", () => { const height = 128; const num_inference_steps = 10; - const res = await textToImage({ - accessToken: env.HF_TOKEN, + const res = await hf.textToImage({ inputs: "award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]", model: "stabilityai/stable-diffusion-2", @@ -636,13 +580,12 @@ describe.concurrent("HfInference", () => { num_inference_steps, }, }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it("imageToText", async () => { expect( - await imageToText({ - accessToken: env.HF_TOKEN, - inputs: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), + await hf.imageToText({ + data: new Blob([readTestFile("cheetah.png")], { type: "image/png" }), model: "nlpconnect/vit-gpt2-image-captioning", }) ).toEqual({ @@ -651,8 +594,7 @@ describe.concurrent("HfInference", () => { }); it("request - openai-community/gpt2", async () => { expect( - await request({ - accessToken: env.HF_TOKEN, + await hf.request({ model: "openai-community/gpt2", inputs: "one plus two equals", }) @@ -666,8 +608,7 @@ describe.concurrent("HfInference", () => { // Skipped at the moment because takes forever it.skip("tabularRegression", async () => { expect( - await tabularRegression({ - accessToken: env.HF_TOKEN, + await hf.tabularRegression({ model: "scikit-learn/Fish-Weight", inputs: { data: { @@ -686,8 +627,7 @@ describe.concurrent("HfInference", () => { // Skipped at the moment because takes forever it.skip("tabularClassification", async () => { expect( - await tabularClassification({ - accessToken: env.HF_TOKEN, + await hf.tabularClassification({ model: "vvmnnnkv/wine-quality", inputs: { data: { @@ -708,19 +648,16 @@ describe.concurrent("HfInference", () => { ).toMatchObject([5, 5, 7]); }); - const hf = new HfInference(); it("endpoint - makes request to specified endpoint", async () => { const ep = hf.endpoint("https://api-inference.huggingface.co/models/openai-community/gpt2"); const { generated_text } = await ep.textGeneration({ inputs: "one plus two equals", - accessToken: env.HF_TOKEN, }); assert.include(generated_text, "three"); }); it("chatCompletion modelId - OpenAI Specs", async () => { - const res = await chatCompletion({ - accessToken: env.HF_TOKEN, + const res = await hf.chatCompletion({ model: "mistralai/Mistral-7B-Instruct-v0.2", messages: [{ role: "user", content: "Complete the this sentence with words one plus one is equal " }], max_tokens: 500, @@ -734,8 +671,7 @@ describe.concurrent("HfInference", () => { }); it("chatCompletionStream modelId - OpenAI Specs", async () => { - const stream = chatCompletionStream({ - accessToken: env.HF_TOKEN, + const stream = hf.chatCompletionStream({ model: "mistralai/Mistral-7B-Instruct-v0.2", messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }], max_tokens: 500, @@ -753,14 +689,15 @@ describe.concurrent("HfInference", () => { 
it("chatCompletionStream modelId Fail - OpenAI Specs", async () => { expect( - chatCompletionStream({ - accessToken: env.HF_TOKEN, - model: "google/gemma-2b", - messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }], - max_tokens: 500, - temperature: 0.1, - seed: 0, - }).next() + hf + .chatCompletionStream({ + model: "google/gemma-2b", + messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }], + max_tokens: 500, + temperature: 0.1, + seed: 0, + }) + .next() ).rejects.toThrowError( "Server google/gemma-2b does not seem to support chat completion. Error: Template error: template not found" ); @@ -839,22 +776,22 @@ describe.concurrent("HfInference", () => { describe.concurrent( "Fal AI", () => { + const client = new HfInference(env.HF_FAL_KEY); + it("textToImage", async () => { - const res = await textToImage({ - accessToken: env.HF_FAL_KEY, + const res = await client.textToImage({ model: "black-forest-labs/FLUX.1-schnell", provider: "fal-ai", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it("speechToText", async () => { - const res = await automaticSpeechRecognition({ - accessToken: env.HF_FAL_KEY, + const res = await client.automaticSpeechRecognition({ model: "openai/whisper-large-v3", provider: "fal-ai", - inputs: new Blob([readTestFile("sample2.wav")], { type: "audio/x-wav" }), + data: new Blob([readTestFile("sample2.wav")], { type: "audio/x-wav" }), }); expect(res).toMatchObject({ text: " he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca", @@ -867,45 +804,43 @@ describe.concurrent("HfInference", () => { describe.concurrent( "Replicate", () => { + const client = new HfInference(env.HF_REPLICATE_KEY); + it("textToImage canonical", async () => { - const res = await textToImage({ - accessToken: env.HF_REPLICATE_KEY, + const res = await client.textToImage({ model: "black-forest-labs/FLUX.1-schnell", provider: "replicate", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it("textToImage versioned", async () => { - const res = await textToImage({ - accessToken: env.HF_REPLICATE_KEY, + const res = await client.textToImage({ model: "ByteDance/SDXL-Lightning", provider: "replicate", inputs: "black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it.skip("textToSpeech versioned", async () => { - const res = await textToSpeech({ - accessToken: env.HF_REPLICATE_KEY, + const res = await client.textToSpeech({ model: "SWivid/F5-TTS", provider: "replicate", inputs: "Hello, how are you?", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); - it("textToSpeech OuteTTS (versioned)", async () => { - const res = await textToSpeech({ - accessToken: env.HF_REPLICATE_KEY, + it("textToSpeech OuteTTS", async () => { + const res = 
await client.textToSpeech({ model: "OuteAI/OuteTTS-0.3-500M", provider: "replicate", inputs: "OuteTTS is a frontier TTS model for its size of 1 Billion parameters", }); - expect(res).toMatchObject({ audio: expect.any(Blob) }); + expect(res).toBeInstanceOf(Blob); }); }, TIMEOUT @@ -913,9 +848,10 @@ describe.concurrent("HfInference", () => { describe.concurrent( "SambaNova", () => { + const client = new HfInference(env.HF_SAMBANOVA_KEY); + it("chatCompletion", async () => { - const res = await chatCompletion({ - accessToken: env.HF_SAMBANOVA_KEY, + const res = await client.chatCompletion({ model: "meta-llama/Llama-3.1-8B-Instruct", provider: "sambanova", messages: [{ role: "user", content: "Complete this sentence with words, one plus one is equal " }], @@ -926,8 +862,7 @@ describe.concurrent("HfInference", () => { } }); it("chatCompletion stream", async () => { - const stream = chatCompletionStream({ - accessToken: env.HF_SAMBANOVA_KEY, + const stream = client.chatCompletionStream({ model: "meta-llama/Llama-3.1-8B-Instruct", provider: "sambanova", messages: [{ role: "user", content: "Complete the equation 1 + 1 = , just the answer" }], @@ -947,9 +882,10 @@ describe.concurrent("HfInference", () => { describe.concurrent( "Together", () => { + const client = new HfInference(env.HF_TOGETHER_KEY); + it("chatCompletion", async () => { - const res = await chatCompletion({ - accessToken: env.HF_TOGETHER_KEY, + const res = await client.chatCompletion({ model: "meta-llama/Llama-3.3-70B-Instruct", provider: "together", messages: [{ role: "user", content: "Complete this sentence with words, one plus one is equal " }], @@ -961,8 +897,7 @@ describe.concurrent("HfInference", () => { }); it("chatCompletion stream", async () => { - const stream = chatCompletionStream({ - accessToken: env.HF_TOGETHER_KEY, + const stream = client.chatCompletionStream({ model: "meta-llama/Llama-3.3-70B-Instruct", provider: "together", messages: [{ role: "user", content: "Complete the equation 1 + 1 = , just the answer" }], @@ -977,18 +912,16 @@ describe.concurrent("HfInference", () => { }); it("textToImage", async () => { - const res = await textToImage({ + const res = await client.textToImage({ model: "stabilityai/stable-diffusion-xl-base-1.0", - accessToken: env.HF_TOGETHER_KEY, provider: "together", inputs: "award winning high resolution photo of a giant tortoise", }); - expect(res).toSatisfy((out) => typeof out === "object" && !!out && "image" in out && out.image instanceof Blob); + expect(res).toBeInstanceOf(Blob); }); it("textGeneration", async () => { - const res = await textGeneration({ - accessToken: env.HF_TOGETHER_KEY, + const res = await client.textGeneration({ model: "mistralai/Mixtral-8x7B-v0.1", provider: "together", inputs: "Paris is", diff --git a/packages/inference/test/tapes.json b/packages/inference/test/tapes.json index 8cf0b9cf61..2c2454a836 100644 --- a/packages/inference/test/tapes.json +++ b/packages/inference/test/tapes.json @@ -4284,289 +4284,56 @@ } } }, - "4cb2d48bab5adad32d2389ad9e40f94aa14a2fab6f68af31a662697e1073afcf": { - "url": "https://api-inference.huggingface.co/models/facebook/bart-large-mnli", - "init": { - "headers": { - "Content-Type": "application/json" - }, - "method": "POST", - "body": "{\"inputs\":\"Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!\",\"parameters\":{\"candidate_labels\":[\"refund\",\"legal\",\"faq\"]}}" - }, - "response": { - "body": "{\"sequence\":\"Hi, I recently bought a device from your 
company but it is not working as advertised and I would like to get reimbursed!\",\"labels\":[\"refund\",\"faq\",\"legal\"],\"scores\":[0.8777874112129211,0.10522667318582535,0.01698591560125351]}", - "status": 200, - "statusText": "OK", - "headers": { - "access-control-allow-credentials": "true", - "access-control-expose-headers": "x-compute-type, x-compute-time", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "407c9eae19e4763e292d1bf6f3b67ac76c0042091fbe85758d526d08f8c55eb1": { - "url": "https://api-inference.huggingface.co/models/google/vit-base-patch16-224", - "init": { - "headers": { - "Content-Type": "application/json" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":[\"Error in `inputs`: Invalid image: {}\"]}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "63a14c4db8cb2c34ed3fabd4d65eaa9657e844368fea8990b4cbeee198db2133": { - "url": "https://api-inference.huggingface.co/models/facebook/detr-resnet-50", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":\"Please log in or use a HF access token\"}", - "status": 429, - "statusText": "Too Many Requests", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "4b27a9c8870792397aee3df4876be9ebc507c3a2fc9e828e13bb81c40bf7cd84": { - "url": "https://api-inference.huggingface.co/models/speechbrain/sepformer-wham", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":\"Malformed soundfile\"}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "0388b6333e7c6df163ac27e69352400261ee87ee8a692fe80d0f8896698d6e7c": { - "url": "https://api-inference.huggingface.co/models/facebook/detr-resnet-50-panoptic", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":[\"Error in `inputs`: Invalid image: {}\"]}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "97b2e862f687bc824468e7a93e9f87b1b8a995642d3f6f5769c06e6ccd6f5b4c": { - "url": "https://api-inference.huggingface.co/models/nlpconnect/vit-gpt2-image-captioning", - "init": { - 
"headers": { - "Content-Type": "application/json" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":[\"Error in `inputs`: Invalid image: {}\"]}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "1d5c927a794ec10ea1fc7fb4e729047cf0282799c0a434bce9d929660488a0f2": { - "url": "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2", - "init": { - "headers": { - "Content-Type": "application/json" - }, - "method": "POST", - "body": "{\"inputs\":\"award winning high resolution photo of a giant tortoise/((ladybird)) hybrid, [trending on artstation]\",\"parameters\":{\"negative_prompt\":[\"blurry\"],\"width\":512,\"height\":128,\"num_inference_steps\":10}}" - }, - "response": { - "body": "{\"error\":\"`negative_prompt` should be the same type to `prompt`, but got != .\"}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "access-control-expose-headers": "x-compute-type, x-compute-time", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "9f068aceef2b0ad23973913eb1a18782be87daf7e8345c57fb68a7eb575de20e": { - "url": "https://api-inference.huggingface.co/models/facebook/wav2vec2-large-960h-lv60-self", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":[\"Error in `inputs`: Malformed soundfile\"]}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "201e762da947b645abe67e466af800afb936ec116a51a03e9b89343faaf7bc01": { - "url": "https://api-inference.huggingface.co/models/superb/hubert-large-superb-er", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{}}" - }, - "response": { - "body": "{\"error\":[\"Error in `inputs`: Malformed soundfile\"]}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "41d6b86bf2438eb25338b4572ad63cc23319154c9e44c23f53a079fbc48862f2": { - "url": "https://api-inference.huggingface.co/models/sentence-transformers/paraphrase-xlm-r-multilingual-v1", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{\"sourceSentence\":\"That is a happy person\",\"sentences\":[\"That is a happy dog\",\"That is a very happy person\",\"Today is a sunny day\"]}}" - }, - "response": { - "body": "{\"error\":[\"Field required: received `source_sentence` in 
`parameters`\"]}", - "status": 400, - "statusText": "Bad Request", - "headers": { - "access-control-allow-credentials": "true", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" - } - } - }, - "80ef172601f7da76a98276d2a1af8b603caf29350fc981b3de7d327c87e0126c": { - "url": "https://api-inference.huggingface.co/models/google/tapas-base-finetuned-wtq", + "0588849bd8db5bca4bbb36916af37a03031b04788867d0db6634ff93cf19ded0": { + "url": "https://api.replicate.com/v1/predictions", "init": { "headers": { "Content-Type": "application/json", - "X-Wait-For-Model": "true" + "Prefer": "wait" }, "method": "POST", - "body": "{\"inputs\":{\"question\":\"How many stars does the transformers repository have?\",\"table\":{\"Repository\":[\"Transformers\",\"Datasets\",\"Tokenizers\"],\"Stars\":[\"36542\",\"4512\",\"3934\"],\"Contributors\":[\"651\",\"77\",\"34\"],\"Programming language\":[\"Python\",\"Python\",\"Rust, Python and NodeJS\"]}}}" + "body": "{\"input\":{\"inputs\":\"OuteTTS is a frontier TTS model for its size of 1 Billion parameters\"},\"version\":\"39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26\"}" }, "response": { - "body": "{\"error\":\"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but is )\",\"warnings\":[\"There was an inference error: Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but is )\"]}", - "status": 400, - "statusText": "Bad Request", + "body": "{\"id\":\"vxnyb0rbe9rm80cmgj1vs1t53w\",\"model\":\"jbilcke/oute-tts\",\"version\":\"39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26\",\"input\":{\"inputs\":\"OuteTTS is a frontier TTS model for its size of 1 Billion parameters\"},\"logs\":\"\",\"output\":\"https://replicate.delivery/xezq/U5zqJthcGtqOAJcfYTF6Xudm8txQmSELa9oqcxR6ZWXZThDKA/output.wav\",\"data_removed\":false,\"error\":null,\"status\":\"processing\",\"created_at\":\"2025-01-20T16:19:28.242Z\",\"urls\":{\"cancel\":\"https://api.replicate.com/v1/predictions/vxnyb0rbe9rm80cmgj1vs1t53w/cancel\",\"get\":\"https://api.replicate.com/v1/predictions/vxnyb0rbe9rm80cmgj1vs1t53w\",\"stream\":\"https://stream.replicate.com/v1/files/bcwr-4lk3m6mpf6hkv27z2a625rwft7baeha2ryx5nd6pdpnfxt3wqbja\"}}", + "status": 201, + "statusText": "Created", "headers": { - "access-control-allow-credentials": "true", - "access-control-expose-headers": "x-compute-type, x-compute-time", + "alt-svc": "h3=\":443\"; ma=86400", + "cf-cache-status": "DYNAMIC", + "cf-ray": "905062e3cfc2d642-CDG", "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + "content-type": "application/json; charset=UTF-8", + "nel": "{\"success_fraction\":0,\"report_to\":\"cf-nel\",\"max_age\":604800}", + "preference-applied": "wait=60", + "ratelimit-remaining": "599", + "ratelimit-reset": "1", + "report-to": "{\"endpoints\":[{\"url\":\"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=OXZ56%2FXmwQi53DKlAoxLJ9Ib85Mm0WyD8%2F6BfygHgwaealJK7sn4mztJhIlQybIWJwTUdh1m%2B2XxemHSbupiRN5lMqyLYNLeH3u6WYxkUFOK6v%2FpSsjN9D27mvwJ2JBsfmGnDGbc4AhqoTLMSpmi\"}],\"group\":\"cf-nel\",\"max_age\":604800}", + "server": "cloudflare", + "server-timing": 
"cfL4;desc=\"?proto=TCP&rtt=5348&min_rtt=5271&rtt_var=2131&sent=5&recv=5&lost=0&retrans=0&sent_bytes=2849&recv_bytes=979&delivery_rate=686704&cwnd=252&unsent_bytes=0&cid=08dbc93a75c5b1df&ts=34187&x=0\"", + "strict-transport-security": "max-age=15552000", + "vary": "Accept-Encoding" } } }, - "cc7951fc493e082384f8839058a329eabaa7a13830bdf1eb03d87bc73b3fc327": { - "url": "https://api-inference.huggingface.co/models/sentence-transformers/paraphrase-xlm-r-multilingual-v1", - "init": { - "headers": { - "Content-Type": "application/json", - "X-Wait-For-Model": "true" - }, - "method": "POST", - "body": "{\"inputs\":{\"sourceSentence\":\"That is a happy person\",\"sentences\":[\"That is a happy dog\",\"That is a very happy person\",\"Today is a sunny day\"],\"source_sentence\":\"That is a happy person\"}}" - }, + "89c2957dc10eb8b7ac6415cdd14447d4c54437354183fe0c7d0c3ef78cf34ad2": { + "url": "https://replicate.delivery/xezq/U5zqJthcGtqOAJcfYTF6Xudm8txQmSELa9oqcxR6ZWXZThDKA/output.wav", + "init": {}, "response": { - "body": "[0.6623499989509583,0.9382342100143433,0.2296333760023117]", + "body": "", "status": 200, "statusText": "OK", "headers": { - "access-control-allow-credentials": "true", - "access-control-expose-headers": "x-compute-type, x-compute-time", - "connection": "keep-alive", - "content-type": "application/json", - "server": "uvicorn", - "transfer-encoding": "chunked", - "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + "accept-ranges": "bytes", + "access-control-allow-origin": "*", + "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000", + "cache-control": "public,max-age=3600", + "cache-id": "PAR-31976c84", + "cache-status": "miss", + "content-type": "audio/x-wav", + "etag": "\"f118ce7abd9171ff463e1319fd4c27cc\"", + "last-modified": "Mon, 20 Jan 2025 16:20:02 GMT", + "server": "UploadServer" } } }, @@ -4580,7 +4347,7 @@ "body": "{\"prompt\":\"black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot\",\"response_format\":\"base64\"}" }, "response": { - "body": "{\"images\":[{\"url\":\"https://fal.media/files/rabbit/oq94iUr2TOJTS7oT_NX27.png\",\"width\":1024,\"height\":768,\"content_type\":\"image/jpeg\"}],\"timings\":{\"inference\":0.3473736699670553},\"seed\":961394545,\"has_nsfw_concepts\":[false],\"prompt\":\"black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot\"}", + "body": "{\"images\":[{\"url\":\"https://fal.media/files/lion/diFpxNG0A6E45szVv6Zee.png\",\"width\":1024,\"height\":768,\"content_type\":\"image/jpeg\"}],\"timings\":{\"inference\":0.3554951280821115},\"seed\":288907632,\"has_nsfw_concepts\":[false],\"prompt\":\"black forest gateau cake spelling out the words FLUX SCHNELL, tasty, food photography, dynamic shot\"}", "status": 200, "statusText": "OK", "headers": { @@ -4590,8 +4357,8 @@ } } }, - "193e46b3d59b3c99dd1e75f33b140eafa529972cd5f9919b539f961dda2a9f21": { - "url": "https://fal.media/files/rabbit/oq94iUr2TOJTS7oT_NX27.png", + "7f69e94a720f6c2c4702c164c1004dd035606c8b53bf31beb27f497c60c834cd": { + "url": "https://fal.media/files/lion/diFpxNG0A6E45szVv6Zee.png", "init": {}, "response": { "body": "", @@ -4602,7 +4369,7 @@ "access-control-allow-methods": "*", "access-control-allow-origin": "*", "access-control-max-age": "86400", - "cf-ray": "90510f76a82cd696-CDG", + "cf-ray": "90506d1b8b2a6981-CDG", "connection": "keep-alive", "content-type": "image/jpeg", "server": "cloudflare", @@ -4620,18 +4387,18 @@ "body": "{\"prompt\":\"award winning 
high resolution photo of a giant tortoise\",\"response_format\":\"base64\",\"model\":\"stabilityai/stable-diffusion-xl-base-1.0\"}" }, "response": { - "body": "{\"id\":\"90510f7d49ae006f-CDG\",\"model\":\"stabilityai/stable-diffusion-xl-base-1.0\",\"object\":\"list\",\"data\":[{\"timings\":{\"inference\":4447},\"index\":0,\"b64_json\":\"/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAQABAADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDmBPsXIoXV9nVcj1qqpBTHpUbLuyCOfepsZmzDqsMw27gG9GqtqSxyRl/MYEc9ePyrEdSjZBxTZHaZApOPc5xRYCpPJl+GY47kYpiBGX5ic1YnsXRN4YOMZ4zVQEqf6VQxrDB4NAOKesJk5JA+pppXDYPWmBPExyKuxE46iqUK\"}]}", + "body": "{\"id\":\"90506d280b7b2298-CDG\",\"model\":\"stabilityai/stable-diffusion-xl-base-1.0\",\"object\":\"list\",\"data\":[{\"timings\":{\"inference\":4469},\"index\":0,\"b64_json\":\"/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAQABAADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDlh2OKQkZ4pi5MZPcGnDOK5Ch4XceeKcBzTd22kUkmgC0hVRk09JfmOMY9Kr/Me5P409I3ZsDINKwxztzTVRnyw6euQKVouM5Jp65WMDB9qAGIdpPJJHel83PakDDkY/M0wjJ4ppgS7u9OU/8A66iBx6U7dQBOAMc05cE4qAMa\"}]}", "status": 200, "statusText": "OK", "headers": { "access-control-allow-origin": "*", "alt-svc": "h3=\":443\"; ma=86400", "cf-cache-status": "DYNAMIC", - "cf-ray": "90510f7d49ae006f-CDG", + "cf-ray": "90506d280b7b2298-CDG", "connection": "keep-alive", "content-encoding": "gzip", "content-type": "application/json; charset=utf-8", - "etag": "W/\"2872f-NGi81NT7Dqn2r2C0juJO/wtH8+A\"", + "etag": "W/\"1e79b-iMsh29xGKx5YyNetamSki3RBjcM\"", "retry-after": "2", "server": "cloudflare", "strict-transport-security": "max-age=15552000; includeSubDomains", @@ -4639,8 +4406,8 @@ } } }, - "d945928c64e9312ce2e915dab66f2e4e047d99cac0f8fcf7c5fec7c724c371b1": { - "url": 
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAQABAADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDmBPsXIoXV9nVcj1qqpBTHpUbLuyCOfepsZmzDqsMw27gG9GqtqSxyRl/MYEc9ePyrEdSjZBxTZHaZApOPc5xRYCpPJl+GY47kYpiBGX5ic1YnsXRN4YOMZ4zVQEqf6VQxrDB4NAOKesJk5JA+pppXDYPWmBPExyKuxE46iqUK", + "45fbd4bfe447bc77eb34c8638a144658ed90fcd6d8163f907228a4bdff595518": { + "url": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAQABAADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDlh2OKQkZ4pi5MZPcGnDOK5Ch4XceeKcBzTd22kUkmgC0hVRk09JfmOMY9Kr/Me5P409I3ZsDINKwxztzTVRnyw6euQKVouM5Jp65WMDB9qAGIdpPJJHel83PakDDkY/M0wjJ4ppgS7u9OU/8A66iBx6U7dQBOAMc05cE4qAMa", "init": {}, "response": { "body": "", @@ -4650,58 +4417,5 @@ "content-type": "image/jpeg" } } - }, - "0588849bd8db5bca4bbb36916af37a03031b04788867d0db6634ff93cf19ded0": { - "url": "https://api.replicate.com/v1/predictions", - "init": { - "headers": { - "Content-Type": "application/json", - "Prefer": "wait" - }, - "method": "POST", - "body": "{\"input\":{\"inputs\":\"OuteTTS is a frontier TTS model for its size of 1 Billion parameters\"},\"version\":\"39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26\"}" - }, - "response": { - "body": "{\"id\":\"91g3v0rcv9rme0cmgksrmsygtc\",\"model\":\"jbilcke/oute-tts\",\"version\":\"39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26\",\"input\":{\"inputs\":\"OuteTTS is a frontier TTS model for its size of 1 Billion parameters\"},\"logs\":\"\",\"output\":\"https://replicate.delivery/xezq/Walbi1xsiJpnOpaRoIZanwV2wDznhRtSqL3a0yekv0wgMiDKA/output.wav\",\"data_removed\":false,\"error\":null,\"status\":\"processing\",\"created_at\":\"2025-01-20T18:21:48.634Z\",\"urls\":{\"cancel\":\"https://api.replicate.com/v1/predictions/91g3v0rcv9rme0cmgksrmsygtc/cancel\",\"get\":\"https://api.replicate.com/v1/predictions/91g3v0rcv9rme0cmgksrmsygtc\",\"stream\":\"https://stream.replicate.com/v1/files/bcwr-k266snnkiye47r7hrf7itzwfjqeoeyk3w4zvrkagx4am3mf2w2vq\"}}", - "status": 201, - "statusText": "Created", - "headers": { - "alt-svc": "h3=\":443\"; 
ma=86400", - "cf-cache-status": "DYNAMIC", - "cf-ray": "905116195e7df09f-CDG", - "connection": "keep-alive", - "content-type": "application/json; charset=UTF-8", - "nel": "{\"success_fraction\":0,\"report_to\":\"cf-nel\",\"max_age\":604800}", - "preference-applied": "wait=60", - "ratelimit-remaining": "599", - "ratelimit-reset": "1", - "report-to": "{\"endpoints\":[{\"url\":\"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=V8mPG5cIfa7uPl7uQKItykjChzevVXF2yID2y%2B5CZTTly8CmMtKV8uccLALG7nzl0UQ7%2B7EBCazWBPFmxACoq6iyZThU5MLkOs8VfWF8fzJrX0sCq6TpfaoHvcK56bjzQjxj\"}],\"group\":\"cf-nel\",\"max_age\":604800}", - "server": "cloudflare", - "server-timing": "cfL4;desc=\"?proto=TCP&rtt=6405&min_rtt=6145&rtt_var=2490&sent=4&recv=5&lost=0&retrans=0&sent_bytes=2848&recv_bytes=982&delivery_rate=471277&cwnd=233&unsent_bytes=0&cid=60e9a77fe6f4b108&ts=5471&x=0\"", - "strict-transport-security": "max-age=15552000", - "vary": "Accept-Encoding" - } - } - }, - "37bbea66468f59310614e1cc5b028a8d1eb7c9288cbf86c40cc15fba4fbeb317": { - "url": "https://replicate.delivery/xezq/Walbi1xsiJpnOpaRoIZanwV2wDznhRtSqL3a0yekv0wgMiDKA/output.wav", - "init": {}, - "response": { - "body": "", - "status": 200, - "statusText": "OK", - "headers": { - "accept-ranges": "bytes", - "access-control-allow-origin": "*", - "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000", - "cache-control": "public,max-age=3600", - "cache-id": "PAR-31976c84", - "cache-status": "miss", - "content-type": "audio/x-wav", - "etag": "\"89805c92016393f733fd73b259cdc5a6\"", - "last-modified": "Mon, 20 Jan 2025 18:21:53 GMT", - "server": "UploadServer" - } - } } } \ No newline at end of file From 55b722d2ac7ce08ce3731921c2a50e56f1737548 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 15:02:49 +0100 Subject: [PATCH 13/23] fix textToSpeech fal --- .../tasks/audio/automaticSpeechRecognition.ts | 49 +++++++++++++------ packages/inference/src/types.ts | 2 +- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index 70eb4df296..0bba3c638b 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -1,10 +1,11 @@ import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks"; import { InferenceOutputError } from "../../lib/InferenceOutputError"; -import type { BaseArgs, Options } from "../../types"; +import type { BaseArgs, Options, RequestArgs } from "../../types"; import { base64FromBytes } from "../../utils/base64FromBytes"; import { request } from "../custom/request"; import type { LegacyAudioInput } from "./utils"; import { preparePayload } from "./utils"; +import { omit } from "../../utils/omit"; export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput); /** @@ -15,21 +16,8 @@ export async function automaticSpeechRecognition( args: AutomaticSpeechRecognitionArgs, options?: Options ): Promise { - const payload = preparePayload(args); - if (args.provider === "fal-ai") { - const contentType = args.inputs.type; - if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) { - throw new Error( - `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join( - ", " - )}` - ); - } - const base64audio = base64FromBytes(new Uint8Array(await 
args.inputs.arrayBuffer()));
-    (args as AutomaticSpeechRecognitionArgs & { audio_url: string }).audio_url =
-      `data:${contentType};base64,${base64audio}`;
-  }
-  const res = await request<AutomaticSpeechRecognitionOutput>(payload as AutomaticSpeechRecognitionArgs, {
+  const payload = await buildPayload(args);
+  const res = await request<AutomaticSpeechRecognitionOutput>(payload, {
     ...options,
     taskHint: "automatic-speech-recognition",
   });
@@ -41,3 +29,32 @@
 }
 
 const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
+
+
+async function buildPayload(
+  args: AutomaticSpeechRecognitionArgs,
+): Promise<RequestArgs> {
+  if (args.provider === "fal-ai") {
+    const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined;
+    const contentType = blob?.type;
+    if (!contentType) {
+      throw new Error(
+        `Unable to determine the input's content-type. Make sure you are passing a Blob when using provider fal-ai.`
+      );
+    }
+    if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
+      throw new Error(
+        `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
+          ", "
+        )}`
+      );
+    }
+    const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
+    return {
+      ...("data" in args ? omit(args, "data") : omit(args, "inputs")),
+      audio_url: `data:${contentType};base64,${base64audio}`
+    }
+  } else {
+    return preparePayload(args);
+  }
+}
\ No newline at end of file
diff --git a/packages/inference/src/types.ts b/packages/inference/src/types.ts
index 3b70538b38..e59bd6b13f 100644
--- a/packages/inference/src/types.ts
+++ b/packages/inference/src/types.ts
@@ -84,7 +84,7 @@ export interface BaseArgs {
 }
 
 export type RequestArgs = BaseArgs &
-  ({ data: Blob | ArrayBuffer } | { inputs: unknown } | ChatCompletionInput) & {
+  ({ data: Blob | ArrayBuffer } | { inputs: unknown } | ChatCompletionInput | { audio_url: string }) & {
     parameters?: Record<string, unknown>;
     accessToken?: string;
   };

From 8181201c7e83eaf166f4e80cc1c7269c07f51a21 Mon Sep 17 00:00:00 2001
From: SBrandeis
Date: Tue, 21 Jan 2025 15:05:24 +0100
Subject: [PATCH 14/23] homogenize test name

---
 packages/inference/test/HfInference.spec.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts
index adc498451b..02471dea48 100644
--- a/packages/inference/test/HfInference.spec.ts
+++ b/packages/inference/test/HfInference.spec.ts
@@ -330,7 +330,7 @@ describe.concurrent("HfInference", () => {
       ])
     );
   });
-  it("SentenceSimilarity", async () => {
+  it("sentenceSimilarity", async () => {
     expect(
       await hf.sentenceSimilarity({
         model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1",

From c6aaa76b85e58581f204781c60e1634c2740af34 Mon Sep 17 00:00:00 2001
From: SBrandeis
Date: Tue, 21 Jan 2025 15:25:04 +0100
Subject: [PATCH 15/23] format

---
 .../inference/src/tasks/audio/audioToAudio.ts | 14 ++++++++------
 .../tasks/audio/automaticSpeechRecognition.ts | 11 ++++-------
 packages/inference/src/tasks/audio/utils.ts   | 12 +++++++-----
 .../tasks/cv/zeroShotImageClassification.ts   | 19 +++++++++----------
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts
index 8ae6b15de5..577e52318c 100644
--- a/packages/inference/src/tasks/audio/audioToAudio.ts
+++ b/packages/inference/src/tasks/audio/audioToAudio.ts
@@ -4,12 +4,14 @@ import { request } from 
"../custom/request"; import type { LegacyAudioInput } from "./utils"; import { preparePayload } from "./utils"; -export type AudioToAudioArgs = BaseArgs & { - /** - * Binary audio data - */ - inputs: Blob; -} | LegacyAudioInput; +export type AudioToAudioArgs = + | (BaseArgs & { + /** + * Binary audio data + */ + inputs: Blob; + }) + | LegacyAudioInput; export interface AudioToAudioOutputElem { /** diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index 0bba3c638b..b9f17d9f1d 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -30,10 +30,7 @@ export async function automaticSpeechRecognition( const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"]; - -async function buildPayload( - args: AutomaticSpeechRecognitionArgs, -): Promise { +async function buildPayload(args: AutomaticSpeechRecognitionArgs): Promise { if (args.provider === "fal-ai") { const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined; const contentType = blob?.type; @@ -52,9 +49,9 @@ async function buildPayload( const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer())); return { ...("data" in args ? omit(args, "data") : omit(args, "inputs")), - audio_url: `data:${contentType};base64,${base64audio}` - } + audio_url: `data:${contentType};base64,${base64audio}`, + }; } else { return preparePayload(args); } -} \ No newline at end of file +} diff --git a/packages/inference/src/tasks/audio/utils.ts b/packages/inference/src/tasks/audio/utils.ts index 58716efd3c..b8dcd10501 100644 --- a/packages/inference/src/tasks/audio/utils.ts +++ b/packages/inference/src/tasks/audio/utils.ts @@ -5,12 +5,14 @@ import { omit } from "../../utils/omit"; * @deprecated */ export interface LegacyAudioInput { - data: Blob | ArrayBuffer + data: Blob | ArrayBuffer; } export function preparePayload(args: BaseArgs & ({ inputs: Blob } | LegacyAudioInput)): RequestArgs { - return "data" in args ? args : { - ...omit(args, "inputs"), - data: args.inputs - } + return "data" in args + ? 
args + : { + ...omit(args, "inputs"), + data: args.inputs, + }; } diff --git a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts index 3ee4e0d9e0..7aa3e14bbe 100644 --- a/packages/inference/src/tasks/cv/zeroShotImageClassification.ts +++ b/packages/inference/src/tasks/cv/zeroShotImageClassification.ts @@ -9,21 +9,20 @@ import type { ZeroShotImageClassificationInput, ZeroShotImageClassificationOutpu * @deprecated */ interface LegacyZeroShotImageClassificationInput { - inputs: { image: Blob | ArrayBuffer } + inputs: { image: Blob | ArrayBuffer }; } -export type ZeroShotImageClassificationArgs = BaseArgs & (ZeroShotImageClassificationInput | LegacyZeroShotImageClassificationInput); +export type ZeroShotImageClassificationArgs = BaseArgs & + (ZeroShotImageClassificationInput | LegacyZeroShotImageClassificationInput); async function preparePayload(args: ZeroShotImageClassificationArgs): Promise { if (args.inputs instanceof Blob) { return { ...args, inputs: { - image: base64FromBytes( - new Uint8Array(await args.inputs.arrayBuffer()) - ) - } - } + image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer())), + }, + }; } else { return { ...args, @@ -32,9 +31,9 @@ async function preparePayload(args: ZeroShotImageClassificationArgs): Promise { - const payload = await preparePayload(args) + const payload = await preparePayload(args); const res = await request(payload, { ...options, taskHint: "zero-shot-image-classification", From 74cdef413f698655ad4890213c21f8da30aba08f Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 15:25:17 +0100 Subject: [PATCH 16/23] update tests: question answering tasks --- packages/inference/test/HfInference.spec.ts | 71 ++++++++++++++------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index 02471dea48..8ce5c05ebf 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -1,6 +1,13 @@ import { expect, it, describe, assert } from "vitest"; -import type { ChatCompletionStreamOutput } from "@huggingface/tasks"; +import type { + ChatCompletionStreamOutput, + DocumentQuestionAnsweringOutputElement, + QuestionAnsweringOutputElement, + TableQuestionAnsweringOutput, + TableQuestionAnsweringOutputElement, + VisualQuestionAnsweringOutputElement, +} from "@huggingface/tasks"; import { chatCompletion, HfInference } from "../src"; import "./vcr"; @@ -88,12 +95,16 @@ describe.concurrent("HfInference", () => { context: "The capital of France is Paris.", }, }) - ).toMatchObject({ - answer: "Paris", - score: expect.any(Number), - start: expect.any(Number), - end: expect.any(Number), - }); + ).toEqual( + expect.arrayContaining([ + { + answer: expect.any(String), + score: expect.any(Number), + start: expect.any(Number), + end: expect.any(Number), + }, + ]) + ); }); it("tableQuestionAnswering", async () => { @@ -110,12 +121,16 @@ describe.concurrent("HfInference", () => { }, }, }) - ).toMatchObject({ - answer: "AVERAGE > 36542", - coordinates: [[0, 1]], - cells: ["36542"], - aggregator: "AVERAGE", - }); + ).toEqual( + expect.arrayContaining([ + { + answer: expect.any(String), + coordinates: [expect.arrayContaining([expect.any(Number)])], + cells: expect.arrayContaining([expect.any(String)]), + aggregator: expect.any(String), + }, + ]) + ); }); it("documentQuestionAnswering", async () => { @@ -127,13 +142,17 @@ 
describe.concurrent("HfInference", () => { image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), }, }) - ).toMatchObject({ - answer: "us-001", - score: expect.any(Number), - // not sure what start/end refers to in this case - start: expect.any(Number), - end: expect.any(Number), - }); + ).toEqual( + expect.arrayContaining([ + { + answer: expect.any(String), + score: expect.any(Number), + // not sure what start/end refers to in this case + start: expect.any(Number), + end: expect.any(Number), + }, + ]) + ); }); // Errors with "Error: If you are using a VisionEncoderDecoderModel, you must provide a feature extractor" @@ -160,10 +179,14 @@ describe.concurrent("HfInference", () => { image: new Blob([readTestFile("cats.png")], { type: "image/png" }), }, }) - ).toMatchObject({ - answer: "2", - score: expect.any(Number), - }); + ).toEqual( + expect.arrayContaining([ + { + answer: expect.any(String), + score: expect.any(Number), + }, + ]) + ); }); it("textClassification", async () => { From 970a5a483f717ff87505587e20981b8200e41b26 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 15:38:44 +0100 Subject: [PATCH 17/23] fix sentenceSimilarity --- .../src/tasks/nlp/sentenceSimilarity.ts | 5 +- packages/inference/test/tapes.json | 48 +++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/packages/inference/src/tasks/nlp/sentenceSimilarity.ts b/packages/inference/src/tasks/nlp/sentenceSimilarity.ts index 5e524cbfcd..b0a39e1843 100644 --- a/packages/inference/src/tasks/nlp/sentenceSimilarity.ts +++ b/packages/inference/src/tasks/nlp/sentenceSimilarity.ts @@ -30,7 +30,8 @@ export async function sentenceSimilarity( function prepareInput(args: SentenceSimilarityArgs) { return { - ...omit(args, "inputs"), - inputs: { ...args.inputs, source_sentence: args.inputs.sourceSentence }, + ...omit(args, ["inputs", "parameters"]), + inputs: { ...omit(args.inputs, "sourceSentence") }, + parameters: { source_sentence: args.inputs.sourceSentence, ...args.parameters }, }; } diff --git a/packages/inference/test/tapes.json b/packages/inference/test/tapes.json index 2c2454a836..245c27b918 100644 --- a/packages/inference/test/tapes.json +++ b/packages/inference/test/tapes.json @@ -4417,5 +4417,53 @@ "content-type": "image/jpeg" } } + }, + "90ce8b1408f4b64c5b6f386d6b3fecd9ae28666937767301b568d91a3618aee7": { + "url": "https://api-inference.huggingface.co/models/sentence-transformers/paraphrase-xlm-r-multilingual-v1", + "init": { + "headers": { + "Content-Type": "application/json", + "X-Wait-For-Model": "true" + }, + "method": "POST", + "body": "{\"inputs\":{\"sentences\":[\"That is a happy dog\",\"That is a very happy person\",\"Today is a sunny day\"]}}" + }, + "response": { + "body": "{\"error\":[\"Field required: received `source_sentence` in `parameters`\"]}", + "status": 400, + "statusText": "Bad Request", + "headers": { + "access-control-allow-credentials": "true", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers, origin, access-control-request-method, access-control-request-headers" + } + } + }, + "6cce6c346a50f52438b1c3b401b2db9f8ba3a39ebebf19bf94750bddf6f7c894": { + "url": "https://api-inference.huggingface.co/models/sentence-transformers/paraphrase-xlm-r-multilingual-v1", + "init": { + "headers": { + "Content-Type": "application/json" + }, + "method": "POST", + "body": 
"{\"inputs\":{\"source_sentence\":\"That is a happy person\",\"sentences\":[\"That is a happy dog\",\"That is a very happy person\",\"Today is a sunny day\"]},\"parameters\":{}}" + }, + "response": { + "body": "[0.6623499989509583,0.9382342100143433,0.2296333760023117]", + "status": 200, + "statusText": "OK", + "headers": { + "access-control-allow-credentials": "true", + "access-control-expose-headers": "x-compute-type, x-compute-time", + "connection": "keep-alive", + "content-type": "application/json", + "server": "uvicorn", + "transfer-encoding": "chunked", + "vary": "Origin, Access-Control-Request-Method, Access-Control-Request-Headers" + } + } } } \ No newline at end of file From 3cbe753783f75b1c03c01d32e91b80f6691aee4c Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 15:39:35 +0100 Subject: [PATCH 18/23] lint + format --- packages/agents/pnpm-lock.yaml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/packages/agents/pnpm-lock.yaml b/packages/agents/pnpm-lock.yaml index 455c7460ab..060aacb353 100644 --- a/packages/agents/pnpm-lock.yaml +++ b/packages/agents/pnpm-lock.yaml @@ -7,7 +7,7 @@ settings: dependencies: '@huggingface/inference': specifier: ^2.6.1 - version: 2.8.1 + version: link:../inference devDependencies: '@types/node': @@ -16,17 +16,6 @@ devDependencies: packages: - /@huggingface/inference@2.8.1: - resolution: {integrity: sha512-EfsNtY9OR6JCNaUa5bZu2mrs48iqeTz0Gutwf+fU0Kypx33xFQB4DKMhp8u4Ee6qVbLbNWvTHuWwlppLQl4p4Q==} - engines: {node: '>=18'} - dependencies: - '@huggingface/tasks': 0.12.30 - dev: false - - /@huggingface/tasks@0.12.30: - resolution: {integrity: sha512-A1ITdxbEzx9L8wKR8pF7swyrTLxWNDFIGDLUWInxvks2ruQ8PLRBZe8r0EcjC3CDdtlj9jV1V4cgV35K/iy3GQ==} - dev: false - /@types/node@18.13.0: resolution: {integrity: sha512-gC3TazRzGoOnoKAhUx+Q0t8S9Tzs74z7m0ipwGpSqQrleP14hKxP4/JUeEQcD3W1/aIpnWl8pHowI7WokuZpXg==} dev: true From 241089fed18815233abb90babe348e55fe3f7a39 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 15:48:14 +0100 Subject: [PATCH 19/23] revert changes --- .../inference/src/lib/makeRequestOptions.ts | 23 +++++++++++-------- packages/inference/test/HfInference.spec.ts | 1 - 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/packages/inference/src/lib/makeRequestOptions.ts b/packages/inference/src/lib/makeRequestOptions.ts index 38e5d24af2..438d6b76be 100644 --- a/packages/inference/src/lib/makeRequestOptions.ts +++ b/packages/inference/src/lib/makeRequestOptions.ts @@ -20,7 +20,10 @@ let tasks: Record | null = null; * Helper that prepares request arguments */ export async function makeRequestOptions( - args: RequestArgs, + args: RequestArgs & { + data?: Blob | ArrayBuffer; + stream?: boolean; + }, options?: Options & { /** When a model can be used for multiple tasks, and we want to run a non-default task */ forceTask?: string | InferenceTask; @@ -39,6 +42,9 @@ export async function makeRequestOptions( if (endpointUrl && provider !== "hf-inference") { throw new Error(`Cannot use endpointUrl with a third-party provider.`); } + if (forceTask && provider !== "hf-inference") { + throw new Error(`Cannot use forceTask with a third-party provider.`); + } if (maybeModel && isUrl(maybeModel)) { throw new Error(`Model URLs are no longer supported. Use endpointUrl instead.`); } @@ -83,7 +89,7 @@ export async function makeRequestOptions( provider === "fal-ai" && authMethod === "provider-key" ? 
`Key ${accessToken}` : `Bearer ${accessToken}`; } - const binary = "data" in args && !!args.data && args.data instanceof Blob; + const binary = "data" in args && !!args.data; if (!binary) { headers["Content-Type"] = "application/json"; @@ -127,13 +133,12 @@ export async function makeRequestOptions( const info: RequestInit = { headers, method: "POST", - body: - "data" in args && args.data instanceof Blob - ? args.data - : JSON.stringify({ - ...otherArgs, - ...(chatCompletion || provider === "together" ? { model } : undefined), - }), + body: binary + ? args.data + : JSON.stringify({ + ...otherArgs, + ...(chatCompletion || provider === "together" ? { model } : undefined), + }), ...(credentials ? { credentials } : undefined), signal: options?.signal, }; diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index 8ce5c05ebf..144e2ba029 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -4,7 +4,6 @@ import type { ChatCompletionStreamOutput, DocumentQuestionAnsweringOutputElement, QuestionAnsweringOutputElement, - TableQuestionAnsweringOutput, TableQuestionAnsweringOutputElement, VisualQuestionAnsweringOutputElement, } from "@huggingface/tasks"; From bebadddb7d1b43cd3b875f3ed4871c07496d9fa5 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Tue, 21 Jan 2025 15:52:12 +0100 Subject: [PATCH 20/23] rm console logs --- packages/tasks-gen/scripts/inference-codegen.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/tasks-gen/scripts/inference-codegen.ts b/packages/tasks-gen/scripts/inference-codegen.ts index 2273a4cd6d..df5f2492b3 100644 --- a/packages/tasks-gen/scripts/inference-codegen.ts +++ b/packages/tasks-gen/scripts/inference-codegen.ts @@ -146,7 +146,6 @@ async function generateBinaryInputTypes( } const propName = propSignature.name.getText(tsSource); const propIsMedia = !!spec["properties"]?.[propName]?.["comment"]?.includes("type=binary"); - console.log(propName, propIsMedia); if (!propIsMedia) { return; } From f21d0e29c0825646e6fa76a582c39206bfcf1f2b Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Thu, 23 Jan 2025 17:25:04 +0100 Subject: [PATCH 21/23] lint! --- packages/inference/src/tasks/cv/textToImage.ts | 10 +++++----- packages/inference/src/types.ts | 8 +++++++- packages/tasks/src/model-libraries-snippets.ts | 2 +- packages/tasks/src/tasks/text-to-video/inference.ts | 3 --- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/packages/inference/src/tasks/cv/textToImage.ts b/packages/inference/src/tasks/cv/textToImage.ts index f8ccf83f40..c8e2d3cbf9 100644 --- a/packages/inference/src/tasks/cv/textToImage.ts +++ b/packages/inference/src/tasks/cv/textToImage.ts @@ -23,11 +23,11 @@ export async function textToImage(args: TextToImageArgs, options?: Options): Pro const payload = args.provider === "together" || args.provider === "fal-ai" || args.provider === "replicate" ? { - ...omit(args, ["inputs", "parameters"]), - ...args.parameters, - ...(args.provider !== "replicate" ? { response_format: "base64" } : undefined), - prompt: args.inputs, - } + ...omit(args, ["inputs", "parameters"]), + ...args.parameters, + ...(args.provider !== "replicate" ? 
{ response_format: "base64" } : undefined), + prompt: args.inputs, + } : args; const res = await request(payload, { ...options, diff --git a/packages/inference/src/types.ts b/packages/inference/src/types.ts index 6d982bfcbe..1f5e6b863e 100644 --- a/packages/inference/src/types.ts +++ b/packages/inference/src/types.ts @@ -84,7 +84,13 @@ export interface BaseArgs { } export type RequestArgs = BaseArgs & - ({ data: Blob | ArrayBuffer } | { inputs: unknown } | { prompt: string } | { audio_url: string } | ChatCompletionInput) & { + ( + | { data: Blob | ArrayBuffer } + | { inputs: unknown } + | { prompt: string } + | { audio_url: string } + | ChatCompletionInput + ) & { parameters?: Record; accessToken?: string; }; diff --git a/packages/tasks/src/model-libraries-snippets.ts b/packages/tasks/src/model-libraries-snippets.ts index b8bf9c81f3..6e8b44ca73 100644 --- a/packages/tasks/src/model-libraries-snippets.ts +++ b/packages/tasks/src/model-libraries-snippets.ts @@ -211,7 +211,7 @@ input_tensor = tf.train.Example( loaded_model = from_pretrained_keras("google/derm-foundation") infer = loaded_model.signatures["serving_default"] print(infer(inputs=tf.constant([input_tensor])))`, -] +]; const diffusersDefaultPrompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"; diff --git a/packages/tasks/src/tasks/text-to-video/inference.ts b/packages/tasks/src/tasks/text-to-video/inference.ts index 532f929a0f..db4c7aec9f 100644 --- a/packages/tasks/src/tasks/text-to-video/inference.ts +++ b/packages/tasks/src/tasks/text-to-video/inference.ts @@ -3,7 +3,6 @@ * * Using src/scripts/inference-codegen */ - /** * Inputs for Text To Video inference */ @@ -18,7 +17,6 @@ export interface TextToVideoInput { parameters?: TextToVideoParameters; [property: string]: unknown; } - /** * Additional inference parameters for Text To Video */ @@ -47,7 +45,6 @@ export interface TextToVideoParameters { seed?: number; [property: string]: unknown; } - /** * Outputs of inference for the Text To Video task */ From a9aeee16a50047d7a6f920b0c342d83852cd5f7b Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Thu, 23 Jan 2025 17:40:12 +0100 Subject: [PATCH 22/23] No breaking changes --- .../inference/src/tasks/audio/audioToAudio.ts | 43 +++-------- .../multimodal/documentQuestionAnswering.ts | 4 +- .../multimodal/visualQuestionAnswering.ts | 4 +- .../src/tasks/nlp/questionAnswering.ts | 30 ++++---- .../src/tasks/nlp/tableQuestionAnswering.ts | 4 +- packages/inference/test/HfInference.spec.ts | 77 +++++++------------ 6 files changed, 59 insertions(+), 103 deletions(-) diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts index 577e52318c..2f426b5205 100644 --- a/packages/inference/src/tasks/audio/audioToAudio.ts +++ b/packages/inference/src/tasks/audio/audioToAudio.ts @@ -6,11 +6,11 @@ import { preparePayload } from "./utils"; export type AudioToAudioArgs = | (BaseArgs & { - /** - * Binary audio data - */ - inputs: Blob; - }) + /** + * Binary audio data + */ + inputs: Blob; + }) | LegacyAudioInput; export interface AudioToAudioOutputElem { @@ -25,9 +25,7 @@ export interface AudioToAudioOutputElem { audio: Blob; } -export type AudioToAudioOutput = AudioToAudioOutputElem[]; - -interface LegacyOutput { +export interface AudioToAudioOutput { blob: string; "content-type": string; label: string; @@ -37,9 +35,9 @@ interface LegacyOutput { * This task reads some audio input and outputs one or multiple audio files. 
* Example model: speechbrain/sepformer-wham does audio source separation. */ -export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise { +export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise { const payload = preparePayload(args); - const res = await request(payload, { + const res = await request(payload, { ...options, taskHint: "audio-to-audio", }); @@ -47,26 +45,12 @@ export async function audioToAudio(args: AudioToAudioArgs, options?: Options): P return validateOutput(res); } -function validateOutput(output: unknown): AudioToAudioOutput { +function validateOutput(output: unknown): AudioToAudioOutput[] { if (!Array.isArray(output)) { throw new InferenceOutputError("Expected Array"); } if ( - output.every((elem): elem is AudioToAudioOutputElem => { - return ( - typeof elem === "object" && - elem && - "label" in elem && - typeof elem.label === "string" && - "audio" in elem && - elem.audio instanceof Blob - ); - }) - ) { - return output; - } - if ( - output.every((elem): elem is LegacyOutput => { + !output.every((elem): elem is AudioToAudioOutput => { return ( typeof elem === "object" && elem && @@ -79,10 +63,7 @@ function validateOutput(output: unknown): AudioToAudioOutput { ); }) ) { - return output.map((elem) => ({ - label: elem.label, - audio: new Blob([elem.blob], { type: elem["content-type"] }), - })); + throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>"); } - throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>"); + return output } diff --git a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts index 60d4aba008..de02c383e2 100644 --- a/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/documentQuestionAnswering.ts @@ -20,7 +20,7 @@ export type DocumentQuestionAnsweringArgs = BaseArgs & export async function documentQuestionAnswering( args: DocumentQuestionAnsweringArgs, options?: Options -): Promise { +): Promise { const reqArgs: RequestArgs = { ...args, inputs: { @@ -51,5 +51,5 @@ export async function documentQuestionAnswering( throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>"); } - return res; + return res[0]; } diff --git a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts index 5e5767d161..f903675fd4 100644 --- a/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts +++ b/packages/inference/src/tasks/multimodal/visualQuestionAnswering.ts @@ -18,7 +18,7 @@ export type VisualQuestionAnsweringArgs = BaseArgs & export async function visualQuestionAnswering( args: VisualQuestionAnsweringArgs, options?: Options -): Promise { +): Promise { const reqArgs: RequestArgs = { ...args, inputs: { @@ -39,5 +39,5 @@ export async function visualQuestionAnswering( if (!isValidOutput) { throw new InferenceOutputError("Expected Array<{answer: string, score: number}>"); } - return res; + return res[0]; } diff --git a/packages/inference/src/tasks/nlp/questionAnswering.ts b/packages/inference/src/tasks/nlp/questionAnswering.ts index 4fc63fa505..1007718e9a 100644 --- a/packages/inference/src/tasks/nlp/questionAnswering.ts +++ b/packages/inference/src/tasks/nlp/questionAnswering.ts @@ -11,29 +11,29 @@ export type QuestionAnsweringArgs = BaseArgs & QuestionAnsweringInput; export async 
function questionAnswering( args: QuestionAnsweringArgs, options?: Options -): Promise { +): Promise { const res = await request(args, { ...options, taskHint: "question-answering", }); const isValidOutput = Array.isArray(res) ? res.every( - (elem) => - typeof elem === "object" && - !!elem && - typeof elem.answer === "string" && - typeof elem.end === "number" && - typeof elem.score === "number" && - typeof elem.start === "number" - ) + (elem) => + typeof elem === "object" && + !!elem && + typeof elem.answer === "string" && + typeof elem.end === "number" && + typeof elem.score === "number" && + typeof elem.start === "number" + ) : typeof res === "object" && - !!res && - typeof res.answer === "string" && - typeof res.end === "number" && - typeof res.score === "number" && - typeof res.start === "number"; + !!res && + typeof res.answer === "string" && + typeof res.end === "number" && + typeof res.score === "number" && + typeof res.start === "number"; if (!isValidOutput) { throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>"); } - return Array.isArray(res) ? res : [res]; + return Array.isArray(res) ? res[0] : res; } diff --git a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts index 2d51d7e067..b38bdda5c8 100644 --- a/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts +++ b/packages/inference/src/tasks/nlp/tableQuestionAnswering.ts @@ -11,7 +11,7 @@ export type TableQuestionAnsweringArgs = BaseArgs & TableQuestionAnsweringInput; export async function tableQuestionAnswering( args: TableQuestionAnsweringArgs, options?: Options -): Promise { +): Promise { const res = await request(args, { ...options, taskHint: "table-question-answering", @@ -22,7 +22,7 @@ export async function tableQuestionAnswering( "Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}" ); } - return Array.isArray(res) ? res : [res]; + return Array.isArray(res) ? 
res[0] : res; } function validate(elem: unknown): elem is TableQuestionAnsweringOutput[number] { diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts index 1bb52077a4..b49a69c0aa 100644 --- a/packages/inference/test/HfInference.spec.ts +++ b/packages/inference/test/HfInference.spec.ts @@ -1,12 +1,6 @@ import { expect, it, describe, assert } from "vitest"; -import type { - ChatCompletionStreamOutput, - DocumentQuestionAnsweringOutputElement, - QuestionAnsweringOutputElement, - TableQuestionAnsweringOutputElement, - VisualQuestionAnsweringOutputElement, -} from "@huggingface/tasks"; +import type { ChatCompletionStreamOutput } from "@huggingface/tasks"; import { chatCompletion, FAL_AI_SUPPORTED_MODEL_IDS, HfInference } from "../src"; import "./vcr"; @@ -95,16 +89,12 @@ describe.concurrent("HfInference", () => { context: "The capital of France is Paris.", }, }) - ).toEqual( - expect.arrayContaining([ - { - answer: expect.any(String), - score: expect.any(Number), - start: expect.any(Number), - end: expect.any(Number), - }, - ]) - ); + ).toMatchObject({ + answer: "Paris", + score: expect.any(Number), + start: expect.any(Number), + end: expect.any(Number), + }); }); it("tableQuestionAnswering", async () => { @@ -121,16 +111,12 @@ describe.concurrent("HfInference", () => { }, }, }) - ).toEqual( - expect.arrayContaining([ - { - answer: expect.any(String), - coordinates: [expect.arrayContaining([expect.any(Number)])], - cells: expect.arrayContaining([expect.any(String)]), - aggregator: expect.any(String), - }, - ]) - ); + ).toMatchObject({ + answer: "AVERAGE > 36542", + coordinates: [[0, 1]], + cells: ["36542"], + aggregator: "AVERAGE", + }); }); it("documentQuestionAnswering", async () => { @@ -142,17 +128,13 @@ describe.concurrent("HfInference", () => { image: new Blob([readTestFile("invoice.png")], { type: "image/png" }), }, }) - ).toEqual( - expect.arrayContaining([ - { - answer: expect.any(String), - score: expect.any(Number), - // not sure what start/end refers to in this case - start: expect.any(Number), - end: expect.any(Number), - }, - ]) - ); + ).toMatchObject({ + answer: "us-001", + score: expect.any(Number), + // not sure what start/end refers to in this case + start: expect.any(Number), + end: expect.any(Number), + }); }); // Errors with "Error: If you are using a VisionEncoderDecoderModel, you must provide a feature extractor" @@ -179,14 +161,10 @@ describe.concurrent("HfInference", () => { image: new Blob([readTestFile("cats.png")], { type: "image/png" }), }, }) - ).toEqual( - expect.arrayContaining([ - { - answer: expect.any(String), - score: expect.any(Number), - }, - ]) - ); + ).toMatchObject({ + answer: "2", + score: expect.any(Number), + }); }); it("textClassification", async () => { @@ -461,11 +439,8 @@ describe.concurrent("HfInference", () => { expect.arrayContaining([ expect.objectContaining({ label: expect.any(String), - audio: expect.any(Blob), - }), - expect.objectContaining({ - label: expect.any(String), - audio: expect.any(Blob), + blob: expect.any(String), + "content-type": expect.any(String), }), ]) ); From 25bbb7583dc62d255dd0a3fe3c2123c4957d1c55 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Thu, 23 Jan 2025 17:42:40 +0100 Subject: [PATCH 23/23] lint... 
--- .../inference/src/tasks/audio/audioToAudio.ts | 12 ++++----- .../src/tasks/nlp/questionAnswering.ts | 26 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/packages/inference/src/tasks/audio/audioToAudio.ts b/packages/inference/src/tasks/audio/audioToAudio.ts index 2f426b5205..a7b23b45dc 100644 --- a/packages/inference/src/tasks/audio/audioToAudio.ts +++ b/packages/inference/src/tasks/audio/audioToAudio.ts @@ -6,11 +6,11 @@ import { preparePayload } from "./utils"; export type AudioToAudioArgs = | (BaseArgs & { - /** - * Binary audio data - */ - inputs: Blob; - }) + /** + * Binary audio data + */ + inputs: Blob; + }) | LegacyAudioInput; export interface AudioToAudioOutputElem { @@ -65,5 +65,5 @@ function validateOutput(output: unknown): AudioToAudioOutput[] { ) { throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>"); } - return output + return output; } diff --git a/packages/inference/src/tasks/nlp/questionAnswering.ts b/packages/inference/src/tasks/nlp/questionAnswering.ts index 1007718e9a..fc16071110 100644 --- a/packages/inference/src/tasks/nlp/questionAnswering.ts +++ b/packages/inference/src/tasks/nlp/questionAnswering.ts @@ -18,20 +18,20 @@ export async function questionAnswering( }); const isValidOutput = Array.isArray(res) ? res.every( - (elem) => - typeof elem === "object" && - !!elem && - typeof elem.answer === "string" && - typeof elem.end === "number" && - typeof elem.score === "number" && - typeof elem.start === "number" - ) + (elem) => + typeof elem === "object" && + !!elem && + typeof elem.answer === "string" && + typeof elem.end === "number" && + typeof elem.score === "number" && + typeof elem.start === "number" + ) : typeof res === "object" && - !!res && - typeof res.answer === "string" && - typeof res.end === "number" && - typeof res.score === "number" && - typeof res.start === "number"; + !!res && + typeof res.answer === "string" && + typeof res.end === "number" && + typeof res.score === "number" && + typeof res.start === "number"; if (!isValidOutput) { throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>"); }
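
For readers tracing the fal-ai audio path above, a minimal usage sketch of the Blob-based automatic speech recognition flow these patches converge on follows. A Blob input lets the provider-specific payload builder read its content type and inline the audio as a base64 audio_url data URI. The sketch is illustrative, not part of the patch series: the model id, access token, and audio URL below are placeholders, and the import assumes the @huggingface/inference package as laid out in these commits.

import { automaticSpeechRecognition } from "@huggingface/inference";

async function transcribe(): Promise<void> {
	// Wrap the audio bytes in a Blob with an explicit content type: the fal-ai
	// branch of buildPayload throws when no type can be read from the input,
	// and only audio/mpeg, audio/mp4, audio/wav and audio/x-wav are accepted.
	const resp = await fetch("https://example.com/sample.wav"); // placeholder URL
	const audio = new Blob([await resp.arrayBuffer()], { type: "audio/wav" });

	const output = await automaticSpeechRecognition({
		accessToken: "hf_xxx", // placeholder token
		provider: "fal-ai",
		model: "openai/whisper-large-v3", // hypothetical model id, for illustration
		inputs: audio,
	});
	// AutomaticSpeechRecognitionOutput carries the transcription in `text`.
	console.log(output.text);
}

transcribe().catch(console.error);

Passing the legacy { data: ArrayBuffer } shape here would throw in the fal-ai branch, since a bare ArrayBuffer exposes no content type; the other providers keep going through preparePayload and accept it unchanged.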