RVC-Boss
diff --git a/‎.github/build_windows_packages.ps1‎
Lines changed: 6 additions & 1 deletion b/‎.github/build_windows_packages.ps1‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎.github/workflows/build_windows_packages.yaml‎
Lines changed: 9 additions & 0 deletions b/‎.github/workflows/build_windows_packages.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎Docker/miniconda_install.sh‎
Lines changed: 20 additions & 2 deletions b/‎Docker/miniconda_install.sh‎
Lines changed: 20 additions & 2 deletions
diff --git a/‎GPT_SoVITS/Accelerate/MLX/__init__.py‎
Lines changed: 11 additions & 0 deletions b/‎GPT_SoVITS/Accelerate/MLX/__init__.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py‎
Lines changed: 64 additions & 0 deletions b/‎GPT_SoVITS/Accelerate/MLX/sample_funcs_mlx.py‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎GPT_SoVITS/Accelerate/MLX/structs_mlx.py‎
Lines changed: 155 additions & 0 deletions b/‎GPT_SoVITS/Accelerate/MLX/structs_mlx.py‎
Lines changed: 155 additions & 0 deletions
@@ -115,12 +115,17 @@ Remove-Item $ffDir.FullName -Recurse -Force
 Write-Host "[INFO] Installing PyTorch..."
 & ".\runtime\python.exe" -m ensurepip
 & ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location
+
 switch ($cuda) {
     "cu124" {
-        & ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install psutil ninja packaging wheel "setuptools>=42" --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
     }
     "cu128" {
+        & ".\runtime\python.exe" -m pip install psutil ninja packaging wheel "setuptools>=42" --no-warn-script-location
         & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location
+        & ".\runtime\python.exe" -m pip install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
     }
     default {
         Write-Error "Unsupported CUDA version: $cuda"
 
@@ -31,6 +31,15 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      # Installs the CUDA toolkit sub-packages listed below (nvcc, cudart, Visual Studio
+      # integration) on Windows runners, only for the matrix entry with torch_cuda '12.8'.
+      # NOTE(review): the toolkit version (12.9.1) differs from the matrix value (12.8);
+      # confirm this pairing is intended.
+      - name: Install Windows CUDA 12.9
+        if: ${{ runner.os == 'Windows' && matrix.torch_cuda == '12.8' }}
+        # NOTE(review): this action reference has no "@<ref>" suffix, unlike
+        # actions/checkout@v4 above; confirm the pinned tag/SHA was not lost.
+        uses: Jimver/cuda-toolkit
+        id: cuda-toolkit-win-129
+        with:
+          cuda: 12.9.1
+          method: "network"
+          sub-packages: '["nvcc", "cudart", "visual_studio_integration"]'
+
       - name: Run Build and Upload Script
         shell: pwsh
         run: |
 
@@ -23,8 +23,10 @@ fi
 
 if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
     "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh
+    SYSROOT_PKG="sysroot_linux-64>=2.28"
 elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then
     "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh
+    SYSROOT_PKG="sysroot_linux-aarch64>=2.28"
 else
     exit 1
 fi
@@ -45,20 +47,36 @@ rm miniconda.sh
 
 source "$HOME/miniconda3/etc/profile.d/conda.sh"
 
+"$HOME/miniconda3/bin/conda" init bash
+
+source "$HOME/.bashrc"
+
 "$HOME/miniconda3/bin/conda" config --add channels conda-forge
 
 "$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null
 
 "$HOME/miniconda3/bin/conda" install python=3.11 -q -y
 
-"$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y
+"$HOME/miniconda3/bin/conda" install gcc=11 gxx ffmpeg cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -q -y
 
 if [ "$CUDA_VERSION" = "12.8" ]; then
     "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
+    "$HOME/miniconda3/bin/conda" install cuda-nvcc=12.8 -c nvidia
 elif [ "$CUDA_VERSION" = "12.6" ]; then
-    "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
+    "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
+    "$HOME/miniconda3/bin/conda" install cuda-nvcc=12.6 -c nvidia
 fi
 
+CUDA_PATH=$(echo "$HOME/miniconda3/targets/"*-linux | awk '{print $1}')
+
+export CUDA_HOME=$CUDA_PATH
+export PATH="$HOME/miniconda3/bin:$PATH"
+export PATH="$CUDA_HOME/bin:$PATH"
+export PATH="$CUDA_HOME/nvvm/bin:$PATH"
+
+"$HOME/miniconda3/bin/pip" install psutil ninja packaging wheel "setuptools>=42"
+"$HOME/miniconda3/bin/pip" install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
+
 "$HOME/miniconda3/bin/pip" cache purge
 
 rm $LOG_PATH
 
@@ -0,0 +1,11 @@
+import importlib.util
+
+if importlib.util.find_spec("mlx") is not None:
+    from .sample_funcs_mlx import sample_naive as sample_naive_mlx
+    from .t2s_engine_mlx import T2SEngine as T2SEngineMLX
+
+    backends = ["MLX"]
+else:
+    backends = []
+
+__all__ = ["T2SEngineMLX", "sample_naive_mlx", "backends"]
@@ -0,0 +1,64 @@
+from functools import partial
+from typing import Protocol, cast
+
+import mlx.core as mx
+
+Array = mx.array
+
+
+class SampleProtocolMLX(Protocol):
+    """Structural type for sampling callables.
+
+    A conforming object is called with the logits, the previously generated
+    tokens and four scalar sampling parameters, and returns an array.
+    """
+
+    @staticmethod
+    def __call__(
+        logits: Array,
+        previous_tokens: Array,
+        temperature: float,
+        top_k: int,
+        top_p: float,
+        repetition_penalty: float,
+    ) -> Array: ...
+
+
+class sample_naive(SampleProtocolMLX):
+    @partial(mx.compile, shapeless=True)
+    @staticmethod
+    def __call__(
+        logits,
+        previous_tokens,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+    ):
+        if temperature <= 1e-5:
+            probs = mx.softmax(logits, axis=-1)
+            return mx.argmax(probs, axis=-1, keepdims=True)
+
+        if repetition_penalty != 1.0:
+            batch_idx = mx.arange(cast(tuple[int, ...], previous_tokens.shape)[0])
+            previous_tokens = previous_tokens.astype(mx.int64)
+            selected_logists = logits[batch_idx, previous_tokens]
+            selected_logists = mx.where(
+                selected_logists < 0, selected_logists * repetition_penalty, selected_logists / repetition_penalty
+            )
+            logits[batch_idx, previous_tokens] = selected_logists
+
+        sorted_indices = mx.argsort(-logits, axis=-1)
+        sorted_logits = mx.take_along_axis(logits, sorted_indices, axis=-1)
+        cum_probs = mx.cumsum(mx.softmax(sorted_logits, axis=-1), axis=-1)
+        sorted_indices_to_remove = cum_probs > top_p
+        sorted_indices_to_remove[:, 0] = False
+        indices_to_remove = mx.zeros_like(logits).astype(mx.bool_)
+        batch_indices = mx.arange(cast(tuple[int, ...], logits.shape)[0])[:, None]
+        indices_to_remove[batch_indices, sorted_indices] = sorted_indices_to_remove
+        logits = mx.where(indices_to_remove, -mx.inf, logits)
+
+        logits = logits / temperature
+
+        v = mx.topk(logits, top_k)
+        pivot = mx.expand_dims(v[:, -1], -1)
+        logits = mx.where(logits < pivot, -mx.inf, logits)
+
+        gumbel_noise = mx.random.gumbel(shape=cast(tuple[int, ...], logits.shape), dtype=logits.dtype)
+        idx_next = mx.argmax(logits + gumbel_noise, axis=-1, keepdims=True).astype(mx.int32)
+
+        return idx_next
@@ -0,0 +1,155 @@
+"""
+Modified From https://github.com/XXXXRT666/GPT-SoVITS
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Callable, List, MutableSequence, Protocol, Type, cast
+
+import mlx.core as mx
+import torch
+
+from ..PyTorch.structs import T2SRequest, T2SResult
+from .sample_funcs_mlx import SampleProtocolMLX, sample_naive
+
+Tensor = torch.Tensor
+Array = mx.array
+
+
+@dataclass(slots=True)
+class T2SRequestMLX:
+    x: List[Array]
+    x_lens: Array
+    prompts: Array
+    bert_feature: List[Array]
+    valid_length: int
+    top_k: int = 5
+    top_p: float = 1
+    early_stop_num: int = -1
+    temperature: float = 1.0
+    repetition_penalty: float = 1.35
+
+    @classmethod
+    def from_torch(cls, request: T2SRequest) -> T2SRequestMLX:
+        x = list(map(lambda tensor: mx.array(tensor.cpu()), request.x))
+        x_lens = mx.array(request.x_lens.cpu())
+        prompts = mx.array(request.prompts.cpu())
+        bert_feature = list(map(lambda tensor: mx.array(tensor.cpu()), request.bert_feature))
+
+        return cls(
+            x,
+            x_lens,
+            prompts,
+            bert_feature,
+            request.valid_length,
+            request.top_k,
+            request.top_p,
+            request.early_stop_num,
+            request.temperature,
+            request.repetition_penalty,
+        )
+
+
+class KVCacheProtocol(Protocol):
+    """Structural interface for a key/value cache.
+
+    Only the signatures are declared here; implementations live elsewhere.
+    """
+
+    # Stored key and value arrays.
+    k_cache: Array
+    v_cache: Array
+
+    # NOTE(review): presumably resets the cache contents; confirm against implementations.
+    def empty(self) -> None: ...
+
+    # Takes a position array plus new key/value arrays and returns a pair of arrays.
+    def update_cache(self, input_pos: Array, k_val: Array, v_val: Array, *args, **kwds) -> tuple[Array, Array]: ...
+
+    # NOTE(review): presumably stores the prefill keys/values; confirm against implementations.
+    def prefill_kv(self, k_val: Array, v_val: Array) -> None: ...
+
+    # Takes another cache object satisfying this protocol; returns nothing.
+    def sync_cache(self, kv_cache: KVCacheProtocol) -> None: ...
+
+
+class T2SDecoderProtocol(Protocol):
+    """Structural interface of the decoder used by ``T2SSessionMLX``.
+
+    The session reads ``max_seq_length`` to size its token buffer, ``n_head``
+    to repeat the attention mask per head, and calls ``embed``.
+    """
+
+    max_seq_length: int
+    # NOTE(review): presumably the end-of-sequence token id; not used in this file.
+    EOS: int
+    n_head: int
+
+    # Called by the session with the text ids, the prompt tokens and the BERT features.
+    def embed(self, x: list[Array], y: Array, bert_features: list[Array]) -> Array: ...
+
+
+class T2SEngineProtocol(Protocol):
+    """Structural interface for a text-to-semantic engine.
+
+    Only the signatures are declared here; implementations live elsewhere.
+    """
+
+    # Returns a list of arrays and a float.
+    # NOTE(review): the meaning of the float is not shown here; confirm against the engine.
+    def _handle_request(self, request: T2SRequest) -> tuple[list[Array], float]: ...
+
+    # Public entry point: takes a torch-based request and returns a T2SResult.
+    def generate(self, request: T2SRequest) -> T2SResult: ...
+
+    # Builds a decoder from a weights path; ``implement`` defaults to "MLX".
+    @staticmethod
+    def load_decoder(
+        weights_path: os.PathLike, max_batch_size: int = 1, implement: str = "MLX"
+    ) -> T2SDecoderProtocol: ...
+
+
+class T2SSessionMLX:
+    def __init__(
+        self,
+        decoder: T2SDecoderProtocol,
+        request_torch: T2SRequest,
+        sample_func: Type[SampleProtocolMLX] = sample_naive,
+        device: mx.Device = mx.Device(mx.cpu),
+        dtype: mx.Dtype = mx.float32,
+    ):
+        with mx.stream(device):
+            request = T2SRequestMLX.from_torch(request_torch)
+
+            self.decoder = decoder
+            self.request = request
+            self.device = device
+            self.dtype = dtype
+
+            bsz = len(request.x)
+            y_len: int = cast(tuple[int, ...], request.prompts.shape)[-1]
+            self.bsz = bsz
+            self.y_len = y_len
+
+            # Cache
+            self.kv_cache: MutableSequence[KVCacheProtocol]
+            self.sample = sample_func()
+
+            # Forward args
+            self.x = [i.astype(mx.int32) for i in request.x]
+            self.x_lens = request.x_lens.astype(mx.int32)
+            self.y = mx.zeros((bsz, decoder.max_seq_length)).astype(mx.int32)
+            self.y[:, : cast(tuple[int, ...], request.prompts.shape)[-1]] = request.prompts.astype(mx.int32)
+            self.bert_feature = [i.astype(dtype) for i in request.bert_feature]
+
+            self.prefill_len = self.x_lens + cast(tuple[int, ...], request.prompts.shape)[1]
+
+            self.input_pos = mx.zeros_like(self.prefill_len)
+            self.input_pos += self.prefill_len
+
+            # EOS
+            self.completed = mx.array([False] * len(self.x)).astype(mx.bool_)
+            self.y_results: List[Array] = [None] * len(self.x)  # type: ignore
+
+            self.xy_pos = decoder.embed(self.x, request.prompts, self.bert_feature)
+
+            max_len = int(self.prefill_len.max(-1))
+            attn_mask = mx.zeros(shape=(bsz, max_len, max_len), dtype=mx.bool_)
+
+            for bs in range(bsz):
+                pos = int(self.x_lens[bs])
+                seq_len = pos + y_len
+
+                attn_mask[bs, :seq_len, :pos] = True
+
+                ar_mask = ~mx.triu(
+                    x=mx.ones(
+                        shape=(
+                            y_len,
+                            y_len,
+                        ),
+                        dtype=mx.bool_,
+                    ),
+                    k=1,
+                )
+                attn_mask[bs, pos:seq_len, pos:seq_len] = ar_mask
+
+            attn_mask = mx.repeat(mx.expand_dims(attn_mask, 1), decoder.n_head, 1)
+            self.attn_mask = attn_mask
+
+            mx.eval(self.attn_mask)