From 3da054a2a8a34e8a7f9e257130546afeb94f0706 Mon Sep 17 00:00:00 2001 From: Eddie Offermann Date: Fri, 18 Apr 2025 14:11:58 -0700 Subject: [PATCH 1/2] Added a gradio interface to make this easy for anyone to use --- app.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 app.py diff --git a/app.py b/app.py new file mode 100644 index 0000000..d5904d1 --- /dev/null +++ b/app.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""app.py +Gradio UI for on‑demand text‑to‑audio synthesis. + +Key change +~~~~~~~~~~ +Audio is **no longer written to disk**. Generated clips are kept in memory and +fed directly to :class:`gr.Audio` widgets (``type="numpy"``). Users can still +preview and download each clip individually via the built‑in download button, +but no permanent archive is created under ``./results``. +""" +from __future__ import annotations + +from typing import List + +import gradio as gr +import numpy as np + +from gen_wav import SAMPLE_RATE, device, gen_wav, initialize_model # type: ignore +from vocoder.bigvgan.models import VocoderBigVGAN # type: ignore + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- +MAX_AUDIO_PLAYERS: int = 10 # simultaneous preview widgets + +SAMPLER = initialize_model( + config="configs/text_to_audio/txt2audio_args.yaml", + ckpt="useful_ckpts/maa1_full.ckpt", +) +VOCODER = VocoderBigVGAN("useful_ckpts/bigvgan", device=device) + + +# --------------------------------------------------------------------------- +# Callback +# --------------------------------------------------------------------------- + +def generate_and_update( + prompt: str, + ddim_steps: int, + duration: int, + n_samples: int, + scale: float, +): + """Generate *n_samples* audio clips and build Gradio update payloads.""" + # ``gen_wav`` returns a list[numpy.ndarray] with values in [-1, 1]. + wavs = gen_wav( + sampler=SAMPLER, + vocoder=VOCODER, + prompt=prompt, + ddim_steps=ddim_steps, + scale=scale, + duration=duration, + n_samples=n_samples, + ) + + updates: List[gr.update] = [] + for i in range(MAX_AUDIO_PLAYERS): + if i < len(wavs): + # Gradio expects (sample_rate, np.ndarray) for ``type="numpy"``. + updates.append(gr.update(value=(SAMPLE_RATE, wavs[i]), visible=True)) + else: + updates.append(gr.update(value=None, visible=False)) + return updates + + +# --------------------------------------------------------------------------- +# UI +# --------------------------------------------------------------------------- + +def build_ui() -> gr.Blocks: + """Assemble and return the Gradio Blocks interface.""" + with gr.Blocks() as demo: + gr.Markdown("## Text‑to‑Audio Generator 🎙️") + + prompt_in = gr.Textbox( + label="Prompt", + value="a bird chirps", + placeholder="Describe the sound you want to generate…", + ) + ddim_in = gr.Slider(label="DDIM Steps", minimum=1, maximum=500, value=100) + duration_in = gr.Slider(label="Duration (s)", minimum=1, maximum=60, value=10) + samples_in = gr.Slider( + label="Number of Samples", + minimum=1, + maximum=MAX_AUDIO_PLAYERS, + value=1, + step=1, + ) + scale_in = gr.Slider( + label="Guidance Scale", minimum=0.0, maximum=10.0, value=3.0, step=0.1 + ) + generate_btn = gr.Button("Generate") + + gr.Markdown("### Preview & Download") + audio_players = [ + gr.Audio(label=f"Clip {i+1}", visible=False, type="numpy") + for i in range(MAX_AUDIO_PLAYERS) + ] + + generate_btn.click( + fn=generate_and_update, + inputs=[prompt_in, ddim_in, duration_in, samples_in, scale_in], + outputs=audio_players, + ) + return demo + + +def main() -> None: + """Launch the Gradio interface in the default browser.""" + build_ui().launch(inbrowser=True) + + +if __name__ == "__main__": + main() From 697c36fac3760e1ad32019668222d84275a9b943 Mon Sep 17 00:00:00 2001 From: Eddie Offermann Date: Fri, 18 Apr 2025 14:36:08 -0700 Subject: [PATCH 2/2] Updated docstrings --- app.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 24 deletions(-) diff --git a/app.py b/app.py index d5904d1..9ca4fb5 100644 --- a/app.py +++ b/app.py @@ -1,29 +1,53 @@ #!/usr/bin/env python3 -"""app.py -Gradio UI for on‑demand text‑to‑audio synthesis. - -Key change -~~~~~~~~~~ -Audio is **no longer written to disk**. Generated clips are kept in memory and -fed directly to :class:`gr.Audio` widgets (``type="numpy"``). Users can still -preview and download each clip individually via the built‑in download button, -but no permanent archive is created under ``./results``. +"""Text-to-Audio Gradio UI (`app.py`). + +This script launches an interactive **Gradio** web application that turns text +prompts into audio clips *on-demand*. + +It wires together three high-level components: + +1. **Diffusion Sampler** - A pre-trained latent-diffusion model that converts a + text embedding into a mel-spectrogram-like latent representation. +2. **Neural Vocoder (BigVGAN)** - Translates the latent representation into a + time-domain waveform at :data:`SAMPLE_RATE`. +3. **Gradio Front-End** - Provides a simple web UI for prompt entry and audio + preview/downloading. Nothing is written to disk unless the user explicitly + clicks each widget’s *Download* button. + +Execution is self-contained and *stateless*: no caching, no temp files, and no +server-side persistence. All heavy lifting happens in GPU memory (*if* the +imported ``device`` points at CUDA) and gets released when the app exits. + +Example +------- +Run the application from the command line: + +```bash +$ python app.py # or ./app.py after chmod +x +``` + +The default browser should automatically open to +``http://127.0.0.1:7860`` showing the UI. """ from __future__ import annotations from typing import List import gradio as gr -import numpy as np +import numpy as np # NumPy is required by Gradio when ``type="numpy"``. from gen_wav import SAMPLE_RATE, device, gen_wav, initialize_model # type: ignore from vocoder.bigvgan.models import VocoderBigVGAN # type: ignore # --------------------------------------------------------------------------- -# Configuration +# Configuration & Model Initialisation # --------------------------------------------------------------------------- -MAX_AUDIO_PLAYERS: int = 10 # simultaneous preview widgets +#: Maximum number of *simultaneous* ``gr.Audio`` preview widgets shown. +MAX_AUDIO_PLAYERS: int = 10 + +# Instantiate the diffusion sampler and the neural vocoder **once** at import +# time so that we pay model-loading latency only on application start-up. SAMPLER = initialize_model( config="configs/text_to_audio/txt2audio_args.yaml", ckpt="useful_ckpts/maa1_full.ckpt", @@ -32,7 +56,7 @@ # --------------------------------------------------------------------------- -# Callback +# Generation Callback # --------------------------------------------------------------------------- def generate_and_update( @@ -41,9 +65,36 @@ def generate_and_update( duration: int, n_samples: int, scale: float, -): - """Generate *n_samples* audio clips and build Gradio update payloads.""" - # ``gen_wav`` returns a list[numpy.ndarray] with values in [-1, 1]. +) -> List[gr.update]: + """Generate *n_samples* audio clips and create Gradio update payloads. + + Args: + prompt: Natural-language description of the desired sound. + ddim_steps: Number of DDIM inference steps for the diffusion sampler. + duration: Desired clip length in **seconds**. + n_samples: Number of distinct clips to synthesize (≤ 10 ``MAX_AUDIO_PLAYERS``). + scale: Classifier-free guidance scale. + + Returns: + A list of :class:`gr.update` objects (length == ``MAX_AUDIO_PLAYERS``) + where each entry either: + + * reveals an audio player with ``(sample_rate, waveform)`` *tuple* for + Gradio to render, **or** + * hides the widget (``visible=False``) if no clip exists for that slot. + + Note: + *Generated waveforms* are kept **in-memory only**. Downloading is left + to the built-in feature of each ``gr.Audio`` component. + + TODO: + • Catch and surface ``RuntimeError`` if GPU memory is insufficient. + • Validate *duration* and *ddim_steps* against model limitations. + """ + + # --------------------------------------------------------------------- + # Waveform synthesis - returns a ``List[np.ndarray]`` with values in [-1, 1]. + # --------------------------------------------------------------------- wavs = gen_wav( sampler=SAMPLER, vocoder=VOCODER, @@ -54,32 +105,50 @@ def generate_and_update( n_samples=n_samples, ) + # --------------------------------------------------------------------- + # Gradio expects (sample_rate, waveform) when ``type='numpy'``. Build a + # fixed-length list so that every output target is accounted for. + # --------------------------------------------------------------------- updates: List[gr.update] = [] for i in range(MAX_AUDIO_PLAYERS): if i < len(wavs): - # Gradio expects (sample_rate, np.ndarray) for ``type="numpy"``. - updates.append(gr.update(value=(SAMPLE_RATE, wavs[i]), visible=True)) + updates.append( + gr.update(value=(SAMPLE_RATE, wavs[i]), visible=True) + ) else: updates.append(gr.update(value=None, visible=False)) + return updates # --------------------------------------------------------------------------- -# UI +# UI Construction # --------------------------------------------------------------------------- def build_ui() -> gr.Blocks: - """Assemble and return the Gradio Blocks interface.""" + """Compose the Gradio Blocks and return the root component. + + The layout is kept intentionally minimal: + + * **Input panel** - Prompt, DDIM steps, duration, sample count, guidance. + * **Generate** button - Triggers diffusion + vocoder inference. + * **Preview & Download** - Dynamically reveals up to + :data:`MAX_AUDIO_PLAYERS` audio players. + """ + with gr.Blocks() as demo: - gr.Markdown("## Text‑to‑Audio Generator 🎙️") + gr.Markdown("## Text-to-Audio Generator 🎙️") + # ------------------------- Input Widgets ------------------------ # prompt_in = gr.Textbox( label="Prompt", value="a bird chirps", placeholder="Describe the sound you want to generate…", ) ddim_in = gr.Slider(label="DDIM Steps", minimum=1, maximum=500, value=100) - duration_in = gr.Slider(label="Duration (s)", minimum=1, maximum=60, value=10) + duration_in = gr.Slider( + label="Duration (s)", minimum=1, maximum=60, value=10 + ) samples_in = gr.Slider( label="Number of Samples", minimum=1, @@ -92,22 +161,31 @@ def build_ui() -> gr.Blocks: ) generate_btn = gr.Button("Generate") + # ------------------------ Output Widgets ------------------------ # gr.Markdown("### Preview & Download") audio_players = [ - gr.Audio(label=f"Clip {i+1}", visible=False, type="numpy") + gr.Audio(label=f"Clip {i + 1}", visible=False, type="numpy") for i in range(MAX_AUDIO_PLAYERS) ] + # ------------------------- Event Binding ------------------------ # generate_btn.click( fn=generate_and_update, inputs=[prompt_in, ddim_in, duration_in, samples_in, scale_in], outputs=audio_players, ) + return demo +# --------------------------------------------------------------------------- +# Entry Point +# --------------------------------------------------------------------------- + def main() -> None: - """Launch the Gradio interface in the default browser.""" + """Launch the Gradio interface in the user’s default browser.""" + + # `inbrowser=True` attempts to open the local server URL automatically. build_ui().launch(inbrowser=True)