@@ -1684,6 +1684,32 @@ image = sana(
 )`,
 ];
 
+export const vibevoice = (model: ModelData): string[] => [
+	`import torch, soundfile as sf, librosa
+from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+
+# Load voice sample (should be 24kHz mono)
+voice, sr = sf.read("path/to/voice_sample.wav")
+if voice.ndim > 1: voice = voice.mean(axis=1)
+if sr != 24000: voice = librosa.resample(voice, orig_sr=sr, target_sr=24000)
+
+processor = VibeVoiceProcessor.from_pretrained("${model.id}")
+model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+    "${model.id}", torch_dtype=torch.bfloat16
+).to("cuda").eval()
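+# Number of denoising (DDPM) steps: fewer is faster, more can improve audio quality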
+model.set_ddpm_inference_steps(5)
+
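+# Build model inputs from the speaker-tagged script and the reference voice audio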
+inputs = processor(text=["Speaker 0: Hello!\\nSpeaker 1: Hi there!"],
+                   voice_samples=[[voice]], return_tensors="pt")
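+# cfg_scale sets the classifier-free guidance strength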
+audio = model.generate(**inputs, cfg_scale=1.3,
+                       tokenizer=processor.tokenizer).speech_outputs[0]
+sf.write("output.wav", audio.cpu().numpy().squeeze(), 24000)`,
+];
+
 export const videoprism = (model: ModelData): string[] => [
 	`# Install from https://github.com/google-deepmind/videoprism
 import jax