from typing import TextIO, Union
from io import StringIO
import whisperx
import whisper
from whisperx.utils import SubtitlesWriter, ResultWriter

from app.asr_models.asr_model import ASRModel
from app.config import CONFIG
from app.utils import WriteTXT, WriteSRT, WriteVTT, WriteTSV, WriteJSON


class WhisperXASR(ASRModel):
    def __init__(self):
        self.x_models = dict()
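        # self.model and self.model_lock are expected to be provided by the
        # ASRModel base class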

    def load_model(self):
        asr_options = {"without_timestamps": False}
        self.model = whisperx.load_model(
            CONFIG.MODEL_NAME, device=CONFIG.DEVICE, compute_type="float32", asr_options=asr_options
        )

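        # Diarization uses pyannote models via whisperx, which need a Hugging
        # Face token to be downloaded; without a token no pipeline is created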
        if CONFIG.HF_TOKEN != "":
            self.diarize_model = whisperx.DiarizationPipeline(use_auth_token=CONFIG.HF_TOKEN, device=CONFIG.DEVICE)

    def transcribe(
        self,
        audio,
        task: Union[str, None],
        language: Union[str, None],
        initial_prompt: Union[str, None],
        vad_filter: Union[bool, None],
        word_timestamps: Union[bool, None],
        options: Union[dict, None],
        output,
    ):
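        # vad_filter and word_timestamps are accepted for interface parity
        # with the other ASR engines but are not used by the whisperx path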
        options_dict = {"task": task}
        if language:
            options_dict["language"] = language
        if initial_prompt:
            options_dict["initial_prompt"] = initial_prompt
        with self.model_lock:
            if self.model is None:
                self.load_model()
            result = self.model.transcribe(audio, **options_dict)

        # Load the required alignment model and cache it.
        # If we transcribe audio in many different languages, this may lead to OOM problems.
        if result["language"] not in self.x_models:
            self.x_models[result["language"]] = whisperx.load_align_model(
                language_code=result["language"], device=CONFIG.DEVICE
            )
        model_x, metadata = self.x_models[result["language"]]

        # Align whisper output
        result = whisperx.align(
            result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False
        )
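        # the aligned result carries word-level start/end timestamps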

        if options and options.get("diarize", False):
            if CONFIG.HF_TOKEN == "":
                print("Warning! HF_TOKEN is not set. Diarization may not work as expected.")
            # add min/max number of speakers if known
            min_speakers = options.get("min_speakers", None)
            max_speakers = options.get("max_speakers", None)
            diarize_segments = self.diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
            result = whisperx.assign_word_speakers(diarize_segments, result)
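            # aligned segments and words now carry "speaker" labels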

        output_file = StringIO()
        self.write_result(result, output_file, output)
        output_file.seek(0)

        return output_file

    def language_detection(self, audio):
        # pad/trim audio to fit the 30 seconds the language detector looks at
        audio = whisper.pad_or_trim(audio)

        # detect the spoken language
        with self.model_lock:
            if self.model is None:
                self.load_model()
            # whisperx.load_model returns a faster-whisper based pipeline, so
            # we use its detect_language, which takes raw audio rather than a
            # mel spectrogram and is assumed to return the language code
            detected_lang_code = self.model.detect_language(audio)

        return detected_lang_code

    def write_result(self, result: dict, file: TextIO, output: Union[str, None]):
        # use whisperx's SubtitlesWriter for srt/vtt when an HF_TOKEN is
        # configured, and the plain ResultWriter otherwise
        subtitle_writer = SubtitlesWriter if CONFIG.HF_TOKEN != "" else ResultWriter
        if output == "srt":
            WriteSRT(subtitle_writer).write_result(result, file=file, options={})
        elif output == "vtt":
            WriteVTT(subtitle_writer).write_result(result, file=file, options={})
        elif output == "tsv":
            WriteTSV(ResultWriter).write_result(result, file=file, options={})
        elif output == "json":
            WriteJSON(ResultWriter).write_result(result, file=file, options={})
        elif output == "txt":
            WriteTXT(ResultWriter).write_result(result, file=file, options={})
        else:
            return "Please select an output method!"
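
# Minimal usage sketch (an illustrative assumption, not part of the service):
# "sample.wav" is a placeholder path, and the input is the 16 kHz mono float32
# waveform returned by whisperx.load_audio; the model loads lazily on first use.
#
#   asr = WhisperXASR()
#   audio = whisperx.load_audio("sample.wav")
#   srt_buffer = asr.transcribe(
#       audio, task="transcribe", language="en", initial_prompt=None,
#       vad_filter=None, word_timestamps=None, options={}, output="srt",
#   )
#   print(srt_buffer.read())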