From e560196e0b62c460edce7ddc2c7bfdce97fcdb87 Mon Sep 17 00:00:00 2001 From: Pedro Luis Cuevas Villarrubia Date: Tue, 24 Mar 2026 18:35:18 +0100 Subject: [PATCH 1/5] feat: add TTS voice message responses via edge-tts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add text-to-speech support using Microsoft Edge neural voices. Final assistant responses are sent as Telegram voice notes alongside text. Features: - /voice — Toggle TTS on/off (per-user) - /voice — Set voice and auto-enable TTS - /voices — Compact locale index with voice counts - /voices — List all voices for a locale (es, en, zh...) - Per-user voice selection with global defaults - Graceful 503 handling for Microsoft service outages - Smart /voices vs /voice confusion detection Config (env vars): - CCBOT_TTS_ENABLED (default: true) - CCBOT_TTS_AUTO (default: false) - CCBOT_TTS_VOICE (default: es-ES-ElviraNeural) Co-Authored-By: Claude Opus 4.6 --- .claude/rules/architecture.md | 5 +- CLAUDE.md | 4 +- README.md | 44 ++++++++- pyproject.toml | 2 + src/ccbot/bot.py | 138 +++++++++++++++++++++++++++- src/ccbot/config.py | 25 ++++- src/ccbot/handlers/message_queue.py | 23 ++++- src/ccbot/tts.py | 121 ++++++++++++++++++++++++ tests/ccbot/test_tts.py | 64 +++++++++++++ 9 files changed, 415 insertions(+), 11 deletions(-) create mode 100644 src/ccbot/tts.py create mode 100644 tests/ccbot/test_tts.py diff --git a/.claude/rules/architecture.md b/.claude/rules/architecture.md index af8de6a5..816d7eaf 100644 --- a/.claude/rules/architecture.md +++ b/.claude/rules/architecture.md @@ -69,7 +69,8 @@ Additional modules: screenshot.py ─ Terminal text → PNG rendering (ANSI color, font fallback) - transcribe.py ─ Voice-to-text transcription via OpenAI API (gpt-4o-transcribe) + transcribe.py ─ Voice-to-text: local Whisper (faster-whisper + CTranslate2 + CUDA) + OpenAI API fallback + tts.py ─ Text-to-speech: edge-tts (Microsoft Edge neural voices) → OGG voice messages to Telegram 
main.py ─ CLI entry point utils.py ─ Shared utilities (ccbot_dir, atomic_write_json) @@ -97,6 +98,8 @@ State files (~/.ccbot/ or $CCBOT_DIR/): - **Tool use ↔ tool result pairing** — `tool_use_id` tracked across poll cycles; tool result edits the original tool_use Telegram message in-place. - **MarkdownV2 with fallback** — All messages go through `safe_reply`/`safe_edit`/`safe_send` which convert via `telegramify-markdown` and fall back to plain text on parse failure. - **No truncation at parse layer** — Full content preserved; splitting at send layer respects Telegram's 4096 char limit with expandable quote atomicity. +- **Local STT with API fallback** — Voice messages transcribed via faster-whisper (CTranslate2 + CUDA, model loaded lazily and resident). Falls back to OpenAI gpt-4o-transcribe API on failure if `OPENAI_API_KEY` is set. Engine selection via `CCBOT_STT_ENGINE` env var. +- **TTS voice responses** — Final assistant messages sent as Telegram voice notes via edge-tts (Microsoft Edge neural voices). Per-user toggle via `/voice` command. Text always sent first; audio appended after. Configurable voice and global auto-enable via `CCBOT_TTS_VOICE` / `CCBOT_TTS_AUTO`. - Only sessions registered in `session_map.json` (via hook) are monitored. - Notifications delivered to users via thread bindings (topic → window_id → session). - **Startup re-resolution** — Window IDs reset on tmux server restart. On startup, `resolve_stale_ids()` matches persisted display names against live windows to re-map IDs. Old state.json files keyed by window name are auto-migrated. diff --git a/CLAUDE.md b/CLAUDE.md index b9d91576..8073df99 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ ccmux — Telegram bot that bridges Telegram Forum topics to Claude Code sessions via tmux windows. Each topic is bound to one tmux window running one Claude Code instance. -Tech stack: Python, python-telegram-bot, tmux, uv. 
+Tech stack: Python, python-telegram-bot, tmux, uv, faster-whisper (CTranslate2 + CUDA), edge-tts (TTS). ## Common Commands @@ -23,6 +23,8 @@ ccbot hook --install # Auto-install Claude Code SessionStart ho - **Hook-based session tracking** — `SessionStart` hook writes `session_map.json`; monitor polls it to detect session changes. - **Message queue per user** — FIFO ordering, message merging (3800 char limit), tool_use/tool_result pairing. - **Rate limiting** — `AIORateLimiter(max_retries=5)` on the Application (30/s global). On restart, the global bucket is pre-filled to avoid burst against Telegram's server-side counter. +- **Local STT** — Voice messages transcribed via faster-whisper (CTranslate2 + CUDA) by default. OpenAI API as fallback. Model loaded lazily on first voice message, stays resident. +- **TTS** — Responses sent as Telegram voice messages via edge-tts (Microsoft Edge neural voices). Per-user toggle via `/voice` command. Configurable voice and auto-enable via env vars. ## Code Conventions diff --git a/README.md b/README.md index e7ee01dc..47503c1e 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ In fact, CCBot itself was built this way — iterating on itself through Claude - **Topic-based sessions** — Each Telegram topic maps 1:1 to a tmux window and Claude session - **Real-time notifications** — Get Telegram messages for assistant responses, thinking content, tool use/result, and local command output - **Interactive UI** — Navigate AskUserQuestion, ExitPlanMode, and Permission Prompts via inline keyboard -- **Voice messages** — Voice messages are transcribed via OpenAI and forwarded as text +- **Voice messages** — Voice messages are transcribed locally via Whisper (faster-whisper + CUDA) and forwarded as text. OpenAI API available as fallback. - **Send messages** — Forward text to Claude Code via tmux keystrokes - **Slash command forwarding** — Send any `/command` directly to Claude Code (e.g. 
`/clear`, `/compact`, `/cost`) - **Create new sessions** — Start Claude Code sessions from Telegram via directory browser @@ -95,8 +95,15 @@ ALLOWED_USERS=your_telegram_user_id | `CLAUDE_COMMAND` | `claude` | Command to run in new windows | | `MONITOR_POLL_INTERVAL` | `2.0` | Polling interval in seconds | | `CCBOT_SHOW_HIDDEN_DIRS` | `false` | Show hidden (dot) directories in directory browser | -| `OPENAI_API_KEY` | _(none)_ | OpenAI API key for voice message transcription | +| `CCBOT_STT_ENGINE` | `whisper` | STT engine: `whisper` (local, CUDA) or `openai` (API) | +| `CCBOT_WHISPER_MODEL` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`, `large-v3-turbo`) | +| `CCBOT_WHISPER_DEVICE` | `cuda` | Compute device: `cuda` or `cpu` | +| `CCBOT_WHISPER_COMPUTE_TYPE` | `float16` | Compute precision: `float16` (GPU), `int8` (GPU, less VRAM), `int8_float16` (balanced) | +| `OPENAI_API_KEY` | _(none)_ | OpenAI API key (used when `CCBOT_STT_ENGINE=openai` or as whisper fallback) | | `OPENAI_BASE_URL` | `https://api.openai.com/v1` | OpenAI API base URL (for proxies or compatible APIs) | +| `CCBOT_TTS_ENABLED` | `true` | Enable TTS (text-to-speech) voice message responses | +| `CCBOT_TTS_AUTO` | `false` | Auto-enable TTS for all users (per-user toggle via `/voice`) | +| `CCBOT_TTS_VOICE` | `es-ES-ElviraNeural` | Edge TTS voice name (run `edge-tts --list-voices` for options) | Message formatting is always HTML via `chatgpt-md-converter` (`chatgpt_md_converter` package). There is no runtime formatter switch to MarkdownV2. @@ -151,6 +158,8 @@ uv run ccbot | `/history` | Message history for this topic | | `/screenshot` | Capture terminal screenshot | | `/esc` | Send Escape to interrupt Claude | +| `/voice` | Toggle TTS voice message responses | +| `/unbind` | Unbind topic from session (window stays alive) | **Claude Code commands (forwarded via tmux):** @@ -178,7 +187,34 @@ Any unrecognized `/command` is also forwarded to Claude Code as-is (e.g. 
`/revie **Sending messages:** -Once a topic is bound to a session, just send text or voice messages in that topic — text gets forwarded to Claude Code via tmux keystrokes, and voice messages are automatically transcribed and forwarded as text. +Once a topic is bound to a session, just send text or voice messages in that topic — text gets forwarded to Claude Code via tmux keystrokes, and voice messages are automatically transcribed (locally via Whisper by default) and forwarded as text. + +### Voice Messages (STT) + +CCBot uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) with CTranslate2 for **local, GPU-accelerated** speech-to-text. No API key required. + +**How it works:** +1. You send a voice message in a Telegram topic +2. The bot downloads the OGG audio (in-memory, never written to disk permanently) +3. faster-whisper transcribes it on the local GPU (CUDA) +4. The transcribed text is forwarded to Claude Code via tmux + +**Supported models** (set via `CCBOT_WHISPER_MODEL`): + +| Model | Params | VRAM (float16) | Speed | Accuracy | +|-------|--------|----------------|-------|----------| +| `tiny` | 39M | ~1 GB | Fastest | Basic | +| `base` | 74M | ~1 GB | Very fast | Good | +| `small` | 244M | ~2 GB | Fast | Good | +| `medium` | 769M | ~5 GB | Moderate | Very good | +| `large-v3` | 1550M | ~10 GB | Moderate | Best | +| `large-v3-turbo` | 809M | ~3 GB | Fast | Near-best | + +The default `large-v3` provides the best accuracy. Use `large-v3-turbo` for a good balance of speed and accuracy with less VRAM usage. The model is downloaded once from HuggingFace Hub and cached locally. + +**Fallback:** If local Whisper fails and `OPENAI_API_KEY` is set, CCBot automatically falls back to OpenAI's `gpt-4o-transcribe` API. + +**VRAM note:** The Whisper model stays loaded in GPU memory after the first voice message. This uses ~3-4 GB VRAM with `large-v3` at `float16`. If GPU memory is limited, use a smaller model or `int8` compute type. 
**Killing a session:** @@ -261,7 +297,7 @@ src/ccbot/ ├── terminal_parser.py # Terminal pane parsing (interactive UI + status line) ├── html_converter.py # Markdown → Telegram HTML conversion + HTML-aware splitting ├── screenshot.py # Terminal text → PNG image with ANSI color support -├── transcribe.py # Voice-to-text transcription via OpenAI API +├── transcribe.py # Voice-to-text: local Whisper (CTranslate2+CUDA) + OpenAI fallback ├── utils.py # Shared utilities (atomic JSON writes, JSONL helpers) ├── tmux_manager.py # Tmux window management (list, create, send keys, kill) ├── fonts/ # Bundled fonts for screenshot rendering diff --git a/pyproject.toml b/pyproject.toml index 0d476088..43aba3c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,8 @@ dependencies = [ "Pillow>=10.0.0", "aiofiles>=24.0.0", "telegramify-markdown>=0.5.0,<1.0.0", + "faster-whisper>=1.2.1", + "edge-tts>=7.2.8", ] [project.scripts] diff --git a/src/ccbot/bot.py b/src/ccbot/bot.py index f8270732..4e7e40d1 100644 --- a/src/ccbot/bot.py +++ b/src/ccbot/bot.py @@ -136,6 +136,7 @@ from .tmux_manager import tmux_manager from .transcribe import close_client as close_transcribe_client from .transcribe import transcribe_voice +from .tts import get_voice, is_tts_enabled, set_voice, toggle_tts from .utils import ccbot_dir logger = logging.getLogger(__name__) @@ -277,6 +278,130 @@ async def unbind_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> ) +async def voice_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: + """Toggle TTS or change voice. + + Usage: + /voice — Toggle TTS on/off + /voice — Set voice (e.g. 
/voice es-AR-ElenaNeural) + """ + user = update.effective_user + if not user or not is_user_allowed(user.id): + return + if not update.message: + return + + if not config.tts_enabled: + await safe_reply(update.message, "❌ TTS is disabled globally (CCBOT_TTS_ENABLED=false).") + return + + args = context.args if context.args else [] + + # /voice — set voice (auto-enable TTS) + if args: + voice_name = args[0] + set_voice(user.id, voice_name) + if not is_tts_enabled(user.id): + toggle_tts(user.id) + await safe_reply( + update.message, + f"🔊 Voice set to `{voice_name}` — TTS ON\n" + "Use /voices to see available options.", + ) + return + + # /voice — toggle + new_state = toggle_tts(user.id) + status = "ON" if new_state else "OFF" + voice_name = get_voice(user.id) + await safe_reply( + update.message, + f"🔊 TTS {status} (voice: {voice_name})\n" + "Use /voice to change voice, /voices to list options.", + ) + + +async def voices_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: + """List available TTS voices. + + Usage: + /voices — Compact index of all locales with voice counts + /voices — All voices for a locale (e.g. 
/voices es, /voices en) + """ + user = update.effective_user + if not user or not is_user_allowed(user.id): + return + if not update.message: + return + + args = context.args if context.args else [] + locale_filter = args[0].lower() if args else "" + + try: + import edge_tts + + all_voices = await edge_tts.list_voices() + + if locale_filter: + # Detect if user used /voices instead of /voice to set a voice + if any(c.isupper() for c in locale_filter): + await safe_reply( + update.message, + f"💡 Did you mean `/voice {locale_filter}`?\n\n" + "/voice — Set a voice (also toggles TTS on)\n" + "/voices — List available voices", + ) + return + + filtered = [v for v in all_voices if v["Locale"].lower().startswith(locale_filter)] + if not filtered: + await safe_reply( + update.message, + f"❌ No voices found for '{locale_filter}'.\n" + "Use /voices to see available locales.", + ) + return + lines = [] + current = get_voice(user.id) + for v in sorted(filtered, key=lambda x: (x["Locale"], x["ShortName"])): + gender = "♂" if v["Gender"] == "Male" else "♀" + tag = " ★" if v["ShortName"] == current else "" + lines.append(f"{gender} `{v['ShortName']}` — {v['Locale']}{tag}") + header = f"🗣 {locale_filter} — {len(lines)} voices\n\n" + else: + from collections import Counter + + locale_counts = Counter(v["Locale"] for v in all_voices) + locale_flags = { + "ar": "🇸🇦", "bg": "🇧🇬", "cs": "🇨🇿", "da": "🇩🇰", "de": "🇩🇪", + "el": "🇬🇷", "en": "🇬🇧", "es": "🇪🇸", "et": "🇪🇪", "fi": "🇫🇮", + "fr": "🇫🇷", "he": "🇮🇱", "hi": "🇮🇳", "hr": "🇭🇷", "hu": "🇭🇺", + "id": "🇮🇩", "it": "🇮🇹", "ja": "🇯🇵", "ko": "🇰🇷", "lt": "🇱🇹", + "lv": "🇱🇻", "ms": "🇲🇾", "nl": "🇳🇱", "no": "🇳🇴", "pl": "🇵🇱", + "pt": "🇧🇷", "ro": "🇷🇴", "ru": "🇷🇺", "sk": "🇸🇰", "sl": "🇸🇮", + "sv": "🇸🇪", "th": "🇹🇭", "tr": "🇹🇷", "uk": "🇺🇦", "vi": "🇻🇳", + "zh": "🇨🇳", + } + lines = [] + for locale, count in sorted(locale_counts.items()): + prefix = locale.split("-")[0] + flag = locale_flags.get(prefix, "🌐") + lines.append(f"{flag} `{locale}` — {count} voices") 
+ header = f"🗣 Available locales ({len(locale_counts)}):\n\n" + + await safe_reply(update.message, header + "\n".join(lines)) + except Exception as e: + err = str(e) + if "503" in err or "Service Unavailable" in err: + await safe_reply( + update.message, + "⚠ Microsoft TTS service is temporarily unavailable (503).\n" + "Try again in a few seconds.", + ) + else: + await safe_reply(update.message, f"❌ Failed to list voices: {e}") + + async def esc_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: """Send Escape key to interrupt Claude.""" user = update.effective_user @@ -642,11 +767,14 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N if not update.message or not update.message.voice: return - if not config.openai_api_key: + stt_available = ( + config.stt_engine == "whisper" or config.openai_api_key + ) + if not stt_available: await safe_reply( update.message, - "⚠ Voice transcription requires an OpenAI API key.\n" - "Set `OPENAI_API_KEY` in your `.env` file and restart the bot.", + "⚠ No STT backend available.\n" + "Set CCBOT_STT_ENGINE=whisper (local) or OPENAI_API_KEY (API) in .env.", ) return @@ -1792,6 +1920,8 @@ async def handle_new_message(msg: NewMessage, bot: Bot) -> None: text=msg.text, thread_id=thread_id, image_data=msg.image_data, + role=msg.role, + is_complete=msg.is_complete, ) # Update user's read offset to current file position @@ -1895,6 +2025,8 @@ def create_bot() -> Application: application.add_handler(CommandHandler("screenshot", screenshot_command)) application.add_handler(CommandHandler("esc", esc_command)) application.add_handler(CommandHandler("unbind", unbind_command)) + application.add_handler(CommandHandler("voice", voice_command)) + application.add_handler(CommandHandler("voices", voices_command)) application.add_handler(CommandHandler("usage", usage_command)) application.add_handler(CallbackQueryHandler(callback_handler)) # Topic closed event — auto-kill associated window diff --git 
a/src/ccbot/config.py b/src/ccbot/config.py index 22d1de76..f3824a0c 100644 --- a/src/ccbot/config.py +++ b/src/ccbot/config.py @@ -101,12 +101,35 @@ def __init__(self) -> None: os.getenv("CCBOT_SHOW_HIDDEN_DIRS", "").lower() == "true" ) - # OpenAI API for voice message transcription (optional) + # STT engine: "whisper" (local, default) or "openai" (API) + self.stt_engine: str = os.getenv("CCBOT_STT_ENGINE", "whisper") + # Whisper config (local STT via faster-whisper + CTranslate2 + CUDA) + self.whisper_model: str = os.getenv("CCBOT_WHISPER_MODEL", "large-v3") + self.whisper_device: str = os.getenv("CCBOT_WHISPER_DEVICE", "cuda") + self.whisper_compute_type: str = os.getenv( + "CCBOT_WHISPER_COMPUTE_TYPE", "float16" + ) + # OpenAI API for voice transcription (fallback when stt_engine=openai) self.openai_api_key: str = os.getenv("OPENAI_API_KEY", "") self.openai_base_url: str = os.getenv( "OPENAI_BASE_URL", "https://api.openai.com/v1" ) + # TTS (Text-to-Speech) via edge-tts (Microsoft Edge neural voices) + self.tts_enabled: bool = os.getenv("CCBOT_TTS_ENABLED", "true").lower() in ( + "true", + "1", + "yes", + ) + self.tts_auto: bool = os.getenv("CCBOT_TTS_AUTO", "false").lower() in ( + "true", + "1", + "yes", + ) + self.tts_voice: str = os.getenv( + "CCBOT_TTS_VOICE", "es-ES-ElviraNeural" + ) + # Scrub sensitive vars from os.environ so child processes never inherit them. # Values are already captured in Config attributes above. 
for var in SENSITIVE_ENV_VARS: diff --git a/src/ccbot/handlers/message_queue.py b/src/ccbot/handlers/message_queue.py index bdd28038..79c124aa 100644 --- a/src/ccbot/handlers/message_queue.py +++ b/src/ccbot/handlers/message_queue.py @@ -64,6 +64,8 @@ class MessageTask: content_type: str = "text" thread_id: int | None = None # Telegram topic thread_id for targeted send image_data: list[tuple[str, bytes]] | None = None # From tool_result images + role: str = "assistant" # "user" or "assistant" + is_complete: bool = False # True when message is final (stop_reason set) # Per-user message queues and worker tasks @@ -192,6 +194,8 @@ async def _merge_content_tasks( tool_use_id=first.tool_use_id, content_type=first.content_type, thread_id=first.thread_id, + role=first.role, + is_complete=first.is_complete, ), merge_count, ) @@ -381,7 +385,20 @@ async def _process_content_task(bot: Bot, user_id: int, task: MessageTask) -> No # 4. Send images if present (from tool_result with base64 image blocks) await _send_task_images(bot, chat_id, task) - # 5. After content, check and send status + # 5. Send TTS voice message if enabled (only for final assistant text) + if ( + task.content_type == "text" + and task.role == "assistant" + and task.is_complete + and task.parts + ): + from ..tts import is_tts_enabled, send_voice_message + + if is_tts_enabled(user_id): + full_text = "\n\n".join(task.parts) + await send_voice_message(bot, chat_id, full_text, task.thread_id, user_id) + + # 6. 
After content, check and send status await _check_and_send_status(bot, user_id, wid, task.thread_id) @@ -601,6 +618,8 @@ async def enqueue_content_message( text: str | None = None, thread_id: int | None = None, image_data: list[tuple[str, bytes]] | None = None, + role: str = "assistant", + is_complete: bool = False, ) -> None: """Enqueue a content message task.""" logger.debug( @@ -620,6 +639,8 @@ async def enqueue_content_message( content_type=content_type, thread_id=thread_id, image_data=image_data, + role=role, + is_complete=is_complete, ) queue.put_nowait(task) diff --git a/src/ccbot/tts.py b/src/ccbot/tts.py new file mode 100644 index 00000000..74e2bc42 --- /dev/null +++ b/src/ccbot/tts.py @@ -0,0 +1,121 @@ +"""Text-to-speech via Microsoft Edge neural voices. + +Provides async TTS using edge-tts to generate OGG/Opus audio for Telegram +voice messages. Voice selection and TTS toggle are per-user. + +Key functions: + - synthesize: Generate OGG audio bytes from text + - send_voice_message: Send voice message to Telegram chat + - is_tts_enabled: Check if TTS is active for a user + - toggle_tts: Toggle TTS on/off for a user + +Dependencies: edge-tts (Microsoft Edge TTS, free, no API key) +""" + +import logging +from pathlib import Path + +from .config import config + +logger = logging.getLogger(__name__) + +_per_user_tts: dict[int, bool] = {} +_per_user_voice: dict[int, str] = {} + +_audio_dir: Path | None = None + + +def _get_audio_dir() -> Path: + global _audio_dir + if _audio_dir is not None: + return _audio_dir + from .utils import ccbot_dir + + d = ccbot_dir() / "audio" + d.mkdir(parents=True, exist_ok=True) + _audio_dir = d + return _audio_dir + + +def is_tts_enabled(user_id: int) -> bool: + """Check if TTS is enabled for a user (global + per-user).""" + if not config.tts_enabled: + return False + return _per_user_tts.get(user_id, config.tts_auto) is not False + + +def toggle_tts(user_id: int) -> bool: + """Toggle TTS for a user. 
Returns new state.""" + _per_user_tts[user_id] = not is_tts_enabled(user_id) + return _per_user_tts[user_id] + + +def get_voice(user_id: int) -> str: + """Get the TTS voice for a user (per-user override or global default).""" + return _per_user_voice.get(user_id, config.tts_voice) + + +def set_voice(user_id: int, voice: str) -> str: + """Set a per-user voice override. Returns the voice name set.""" + _per_user_voice[user_id] = voice + return voice + + +async def synthesize(text: str, user_id: int | None = None, voice: str | None = None) -> bytes: + """Synthesize text to OGG/Opus audio bytes using edge-tts. + + Args: + text: Text to synthesize (max ~4000 chars for Telegram voice). + user_id: User ID for per-user voice selection. + voice: Voice name override (takes priority over user_id). + + Returns: + OGG/Opus audio bytes ready for Telegram send_voice. + """ + if not voice and user_id: + voice = get_voice(user_id) + voice = voice or config.tts_voice + + import edge_tts + + audio_dir = _get_audio_dir() + tmp_path = audio_dir / f"tts_{id(text) % 10**8}.ogg" + + try: + communicate = edge_tts.Communicate(text, voice) + await communicate.save(str(tmp_path)) + data = tmp_path.read_bytes() + if not data: + raise ValueError("TTS produced empty audio") + return data + finally: + tmp_path.unlink(missing_ok=True) + + +async def send_voice_message( + bot, chat_id: int, text: str, thread_id: int | None = None, user_id: int | None = None +) -> None: + """Send text as a Telegram voice message. + + Generates audio via edge-tts and sends as a voice note. + Falls back silently to text-only if TTS fails. 
+ """ + from telegram.constants import ChatAction + + truncated = text[:4000] + try: + await bot.send_chat_action(chat_id=chat_id, action=ChatAction.RECORD_VOICE) + audio_data = await synthesize(truncated, user_id=user_id) + kwargs = {"chat_id": chat_id, "voice": audio_data} + if thread_id is not None: + kwargs["message_thread_id"] = thread_id + await bot.send_voice(**kwargs) + except Exception: + logger.warning("TTS failed, skipping voice message", exc_info=True) + + +async def close() -> None: + """Cleanup TTS resources.""" + global _audio_dir + _per_user_tts.clear() + _audio_dir = None diff --git a/tests/ccbot/test_tts.py b/tests/ccbot/test_tts.py new file mode 100644 index 00000000..ee5e33e7 --- /dev/null +++ b/tests/ccbot/test_tts.py @@ -0,0 +1,64 @@ +"""Tests for TTS module.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from ccbot.tts import is_tts_enabled, toggle_tts + + +@pytest.fixture +def mock_config(): + with patch.object( + __import__("ccbot.tts", fromlist=["config"]).config, + "tts_enabled", + True, + create=True, + ), patch.object( + __import__("ccbot.tts", fromlist=["config"]).config, + "tts_auto", + False, + create=True, + ), patch.object( + __import__("ccbot.tts", fromlist=["config"]).config, + "tts_voice", + "es-ES-ElviraNeural", + create=True, + ): + from ccbot import tts + + tts._per_user_tts.clear() + yield tts.config + + +class TestTTSToggle: + def test_toggle_on_by_default_tts_auto_false(self, mock_config): + assert is_tts_enabled(12345) is False + + def test_toggle_on(self, mock_config): + new_state = toggle_tts(12345) + assert new_state is True + assert is_tts_enabled(12345) is True + + def test_toggle_off(self, mock_config): + toggle_tts(12345) + new_state = toggle_tts(12345) + assert new_state is False + assert is_tts_enabled(12345) is False + + def test_per_user_isolation(self, mock_config): + toggle_tts(100) + assert is_tts_enabled(100) is True + assert is_tts_enabled(200) is False + + +class 
TestTTSGlobalDisabled: + def test_global_disabled_ignores_per_user(self): + with patch( + "ccbot.tts.config", + MagicMock(tts_enabled=False, tts_auto=False, tts_voice="test"), + ): + from ccbot import tts + + tts._per_user_tts.clear() + assert is_tts_enabled(12345) is False From b9307f76a26dd5c3596e0d35bf2996d840280cf3 Mon Sep 17 00:00:00 2001 From: Pedro Luis Cuevas Villarrubia Date: Tue, 24 Mar 2026 19:06:32 +0100 Subject: [PATCH 2/5] fix: voice validation, text cleanup, merge role preservation - Validate voice names in set_voice() to prevent invalid names like '/voices' from crashing edge-tts (ValueError) - Add clean_text_for_tts() to strip emojis, symbols, and markdown artifacts before TTS synthesis for cleaner audio - Preserve 'assistant' role when merging mixed-role tasks so TTS isn't skipped when user+assistant messages are batch-merged - Add 5 new tests for text cleanup Co-Authored-By: Claude Opus 4.6 --- src/ccbot/bot.py | 6 ++- src/ccbot/handlers/message_queue.py | 9 ++++- src/ccbot/tts.py | 62 ++++++++++++++++++++++++++++- tests/ccbot/test_tts.py | 47 ++++++++++++++++++++++ 4 files changed, 120 insertions(+), 4 deletions(-) diff --git a/src/ccbot/bot.py b/src/ccbot/bot.py index 4e7e40d1..5482d1bc 100644 --- a/src/ccbot/bot.py +++ b/src/ccbot/bot.py @@ -300,7 +300,11 @@ async def voice_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N # /voice — set voice (auto-enable TTS) if args: voice_name = args[0] - set_voice(user.id, voice_name) + try: + set_voice(user.id, voice_name) + except ValueError as e: + await safe_reply(update.message, f"❌ {e}\nUse /voices to see available voices.") + return if not is_tts_enabled(user.id): toggle_tts(user.id) await safe_reply( diff --git a/src/ccbot/handlers/message_queue.py b/src/ccbot/handlers/message_queue.py index 79c124aa..afa7f750 100644 --- a/src/ccbot/handlers/message_queue.py +++ b/src/ccbot/handlers/message_queue.py @@ -186,6 +186,11 @@ async def _merge_content_tasks( if merge_count == 0: return 
first, 0 + # Preserve assistant role when merging mixed-role tasks (e.g. user + assistant). + # TTS and other role-dependent features rely on the correct role. + merged_role = "assistant" if any(t.role == "assistant" for t in items) else first.role + merged_complete = any(t.is_complete for t in items) + return ( MessageTask( task_type="content", @@ -194,8 +199,8 @@ async def _merge_content_tasks( tool_use_id=first.tool_use_id, content_type=first.content_type, thread_id=first.thread_id, - role=first.role, - is_complete=first.is_complete, + role=merged_role, + is_complete=merged_complete, ), merge_count, ) diff --git a/src/ccbot/tts.py b/src/ccbot/tts.py index 74e2bc42..245bb78f 100644 --- a/src/ccbot/tts.py +++ b/src/ccbot/tts.py @@ -13,12 +13,67 @@ """ import logging +import re from pathlib import Path from .config import config logger = logging.getLogger(__name__) +# Regex patterns to strip non-speech content before TTS +_TTS_CLEANUP = [ + # Emojis (common ranges) + re.compile( + "[" + "\U0001F600-\U0001F64F" # emoticons + "\U0001F300-\U0001F5FF" # symbols & pictographs + "\U0001F680-\U0001F6FF" # transport & map + "\U0001F700-\U0001F77F" # alchemical symbols + "\U0001F780-\U0001F7FF" # geometric shapes extended + "\U0001F800-\U0001F8FF" # supplemental arrows-C + "\U0001F900-\U0001F9FF" # supplemental symbols & pictographs + "\U0001FA00-\U0001FA6F" # chess symbols + "\U0001FA70-\U0001FAFF" # symbols & pictographs extended-A + "\U00002702-\U000027B0" # dingbats + "\U000024C2-\U0001F251" # enclosed characters + "\U0001F200-\U0001F2FF" # enclosed ideographic supplement + "\U00002600-\U000026FF" # misc symbols + "\U00002700-\U000027BF" # dingbats (overlap, intentional) + "\U0000FE00-\U0000FE0F" # variation selectors + "\U0000200D" # zero-width joiner + "]+", + flags=re.UNICODE, + ), + # Telegram-style expandable quotes and blockquotes + re.compile(r"[▁-▉]+"), + # Markdown/code artifacts + re.compile(r"[*_`~#|>]+"), + # Arrow-like symbols + 
re.compile(r"[→←↑↓↔↕➜➤➡⇒⇐⇑⇓⇔⇕]+"), + # Decorative box-drawing and block elements + re.compile(r"[═║╔╗╚╝╠╣╦╩─│┌┐└┘├┤┬┴┼]+"), + # Bullet points and list markers + re.compile(r"[•●○◦▪▫➢➣➤◆◇★☆►◄▲▼]+"), + # Misc symbols that TTS reads badly + re.compile(r"[⚡🔥💡✅❌⚠️🔊🗣💡]+"), + # Multiple consecutive whitespace + re.compile(r"\n{3,}", re.MULTILINE), + # Code fences + re.compile(r"```[\s\S]*?```"), +] + + +def clean_text_for_tts(text: str) -> str: + """Strip emojis, symbols, and markdown artifacts for TTS synthesis. + + Keeps normal punctuation (.,;:!?¿¡), letters, numbers, and whitespace. + Collapses multiple newlines to double newlines for natural pauses. + """ + for pattern in _TTS_CLEANUP: + text = pattern.sub("", text) + text = text.strip() + return text if text else "" + _per_user_tts: dict[int, bool] = {} _per_user_voice: dict[int, str] = {} @@ -57,6 +112,9 @@ def get_voice(user_id: int) -> str: def set_voice(user_id: int, voice: str) -> str: """Set a per-user voice override. Returns the voice name set.""" + # Basic sanity: voice names are like "es-ES-ElviraNeural", not commands + if "/" in voice or voice.startswith(("list", "all")): + raise ValueError(f"'{voice}' doesn't look like a voice name.") _per_user_voice[user_id] = voice return voice @@ -102,7 +160,9 @@ async def send_voice_message( """ from telegram.constants import ChatAction - truncated = text[:4000] + truncated = clean_text_for_tts(text[:4000]) + if not truncated: + return try: await bot.send_chat_action(chat_id=chat_id, action=ChatAction.RECORD_VOICE) audio_data = await synthesize(truncated, user_id=user_id) diff --git a/tests/ccbot/test_tts.py b/tests/ccbot/test_tts.py index ee5e33e7..6c54d474 100644 --- a/tests/ccbot/test_tts.py +++ b/tests/ccbot/test_tts.py @@ -62,3 +62,50 @@ def test_global_disabled_ignores_per_user(self): tts._per_user_tts.clear() assert is_tts_enabled(12345) is False + + +class TestCleanTextForTTS: + def test_strips_emojis(self): + from ccbot.tts import clean_text_for_tts + + 
result = clean_text_for_tts("Hola 👋 ¿cómo estás? 😊") + assert "👋" not in result + assert "😊" not in result + assert "Hola" in result + assert "cómo estás" in result + + def test_strips_markdown(self): + from ccbot.tts import clean_text_for_tts + + result = clean_text_for_tts("## Título **negrita** y `código`") + assert "##" not in result + assert "**" not in result + assert "`" not in result + assert "Título" in result + assert "negrita" in result + + def test_strips_arrows_and_symbols(self): + from ccbot.tts import clean_text_for_tts + + result = clean_text_for_tts("⚠️ Error → solución ✅") + assert "⚠️" not in result + assert "→" not in result + assert "✅" not in result + assert "Error" in result + assert "solución" in result + + def test_keeps_normal_punctuation(self): + from ccbot.tts import clean_text_for_tts + + result = clean_text_for_tts("¡Hola! ¿Qué tal? Bien, gracias.") + assert "¡" in result + assert "!" in result + assert "¿" in result + assert "?" in result + assert "." in result + + def test_empty_after_clean(self): + from ccbot.tts import clean_text_for_tts + + result = clean_text_for_tts("⚠️⚡🔥") + assert result == "" From 1921bd594bfea41ba9f946933850ac499879c2fd Mon Sep 17 00:00:00 2001 From: Pedro Luis Cuevas Villarrubia Date: Tue, 24 Mar 2026 19:21:04 +0100 Subject: [PATCH 3/5] fix: compute merge metadata from actually merged tasks only merged_role and merged_complete were derived from all drained queue items including non-merged ones put back. Now uses [first] + items[:merge_count] to avoid incorrectly labeling merged tasks when later non-mergeable tasks have assistant role or is_complete=True. 
Co-Authored-By: Claude Opus 4.6 --- src/ccbot/handlers/message_queue.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ccbot/handlers/message_queue.py b/src/ccbot/handlers/message_queue.py index afa7f750..c21e9dc9 100644 --- a/src/ccbot/handlers/message_queue.py +++ b/src/ccbot/handlers/message_queue.py @@ -188,8 +188,10 @@ async def _merge_content_tasks( # Preserve assistant role when merging mixed-role tasks (e.g. user + assistant). # TTS and other role-dependent features rely on the correct role. - merged_role = "assistant" if any(t.role == "assistant" for t in items) else first.role - merged_complete = any(t.is_complete for t in items) + # Only consider tasks that were actually merged, not remaining ones put back. + merged_tasks = [first] + items[:merge_count] + merged_role = "assistant" if any(t.role == "assistant" for t in merged_tasks) else first.role + merged_complete = any(t.is_complete for t in merged_tasks) return ( MessageTask( From 1d9a3312c810bf4ad9a579e9b09bb782af5bbb85 Mon Sep 17 00:00:00 2001 From: Pedro Luis Cuevas Villarrubia Date: Tue, 24 Mar 2026 19:37:06 +0100 Subject: [PATCH 4/5] fix: move code fence cleanup before markdown backtick stripping The markdown cleanup pattern stripped backticks before the code fence pattern could match, leaving code block content as orphan text in TTS. Now code fences are removed first (with their content), then remaining inline backticks are cleaned up. 
Co-Authored-By: Claude Opus 4.6 --- src/ccbot/tts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ccbot/tts.py b/src/ccbot/tts.py index 245bb78f..777f5852 100644 --- a/src/ccbot/tts.py +++ b/src/ccbot/tts.py @@ -44,9 +44,11 @@ "]+", flags=re.UNICODE, ), + # Code fences (must run BEFORE markdown cleanup strips the backticks) + re.compile(r"```[\s\S]*?```"), # Telegram-style expandable quotes and blockquotes re.compile(r"[▁-▉]+"), - # Markdown/code artifacts + # Markdown/code artifacts (inline only — fences handled above) re.compile(r"[*_`~#|>]+"), # Arrow-like symbols re.compile(r"[→←↑↓↔↕➜➤➡⇒⇐⇑⇓⇔⇕]+"), @@ -58,8 +60,6 @@ re.compile(r"[⚡🔥💡✅❌⚠️🔊🗣💡]+"), # Multiple consecutive whitespace re.compile(r"\n{3,}", re.MULTILINE), - # Code fences - re.compile(r"```[\s\S]*?```"), ] From 0713990fff23977884bc02e83d039296bdf3b8cb Mon Sep 17 00:00:00 2001 From: Pedro Luis Cuevas Villarrubia Date: Tue, 24 Mar 2026 19:48:26 +0100 Subject: [PATCH 5/5] fix: add leading pause to prevent first-word truncation in TTS audio OGG/Opus encoding trims the first few milliseconds, cutting the initial phoneme. Prefixing text with "... " forces edge-tts to start with a brief silence, preserving the first word intact. Co-Authored-By: Claude Opus 4.6 --- src/ccbot/tts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ccbot/tts.py b/src/ccbot/tts.py index 777f5852..7f92539c 100644 --- a/src/ccbot/tts.py +++ b/src/ccbot/tts.py @@ -163,6 +163,8 @@ async def send_voice_message( truncated = clean_text_for_tts(text[:4000]) if not truncated: return + # Prefix with brief pause to prevent first-word truncation in OGG/Opus encoding + truncated = "... " + truncated try: await bot.send_chat_action(chat_id=chat_id, action=ChatAction.RECORD_VOICE) audio_data = await synthesize(truncated, user_id=user_id)