Skip to content

Commit c6fd867

Browse files
authored
feat(core): impl tts & video gen (#23)
1 parent 60796eb commit c6fd867

File tree

12 files changed

+455
-5
lines changed

12 files changed

+455
-5
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,21 @@ gems list
120120
gems imagen -t "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"
121121
```
122122

123+
### Generate a Video
124+
125+
```sh
126+
gems vidgen -t "Yo, generate a humble bumble golden retriever puppy running through a flower field."
127+
```
128+
129+
### Text to speech
130+
131+
```sh
132+
gems tts -t "Yo, say 'Hello' with a humble bumble voice!"
133+
134+
# Then install `ffmpeg` and run:
135+
ffmpeg -f s16le -ar 24000 -ac 1 -i output.pcm out.wav
136+
```
137+
123138
### TUI mode
124139

125140
```sh
@@ -143,6 +158,8 @@ gems
143158
| `vision` | Analyze an image and generate content from text. |
144159
| `stream` | Stream the generation of content. |
145160
| `imagen` | Generate an image. |
161+
| `vidgen` | Generate a video. |
162+
| `tts` | Text to speech. |
146163
| `count` | Count the number of tokens in a text. |
147164
| `embed` | Embed content into a specified model. |
148165
| `batch` | Batch embed multiple contents. |

src/cli.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,24 @@ EXAMPLES:
7272
Batch embed contents:
7373
gems -m 'embedding-001' batch -t "Write a story about a magic backpack.","Generate a poem about nature."
7474
75+
Generate an Image:
76+
gems imagen -t "Yo, create a 3d rendered image of a cat with wings."
77+
78+
Generate a Video:
79+
gems vidgen -t "Yo, generate a humble bumble golden retriever puppy running through a flower field."
80+
81+
Text to speech:
82+
gems tts -t "Yo, Say 'Hello' with a humble bumble voice!."
83+
7584
Get model info:
7685
gems info
7786
7887
List models:
7988
gems list
8089
90+
TUI mode:
91+
gems
92+
8193
For more information, visit: github.com/kevin-rs/gems
8294
"#
8395
)]
@@ -104,6 +116,8 @@ pub enum Command {
104116
Info(Info),
105117
List(List),
106118
Imagen(Imagen),
119+
Vidgen(Vidgen),
120+
Tts(Tts),
107121
}
108122

109123
#[cfg(feature = "cli")]
@@ -172,3 +186,22 @@ pub struct Imagen {
172186
#[arg(short, long, default_value_t = String::from("Hi, step bro... I need help generating a happy, humble, bumble Rustacean. he's stuck in the shower and won't compile."))]
173187
pub text: String,
174188
}
189+
190+
// Arguments accepted by the `vidgen` subcommand.
// (Plain comments are used here on purpose: `///` comments on clap items feed the help output.)
#[cfg(feature = "cli")]
#[derive(Clone, Debug, Args)]
pub struct Vidgen {
    /// The text to generate video from.
    #[arg(
        short,
        long,
        default_value = "A humble bumble golden retriever puppy running through a flower field"
    )]
    pub text: String,
}
197+
198+
// Arguments accepted by the `tts` subcommand: the prompt text and the voice name.
// (Plain comments are used here on purpose: `///` comments on clap items feed the help output.)
#[cfg(feature = "cli")]
#[derive(Clone, Debug, Args)]
pub struct Tts {
    /// The prompt.
    #[arg(short, long, default_value = "Say cheerfully: Have a wonderful day!")]
    pub text: String,
    /// The voice.
    #[arg(short, long, default_value = "Kore")]
    pub voice: String,
}

src/client.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ use crate::models::Models;
66
use crate::stream::Streaming;
77
use crate::tokens::Tokens;
88
use crate::traits::CTrait;
9+
use crate::tts::Tts;
10+
use crate::vidgen::Videos;
911
use crate::vision::Visions;
1012
use anyhow::anyhow;
1113
use anyhow::Result;
@@ -66,7 +68,6 @@ impl CTrait for Client {
6668
endpoint
6769
)
6870
};
69-
7071
let parsed_url = Url::parse_with_params(&full_url, &[("key", api_key)]).unwrap();
7172

7273
Ok(self
@@ -115,6 +116,18 @@ impl CTrait for Client {
115116
client: self.clone(),
116117
}
117118
}
119+
120+
fn videos(&self) -> Videos {
121+
Videos {
122+
client: self.clone(),
123+
}
124+
}
125+
126+
fn tts(&self) -> Tts {
127+
Tts {
128+
client: self.clone(),
129+
}
130+
}
118131
}
119132

120133
#[derive(Default)]

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ pub mod responses;
1212
pub mod stream;
1313
pub mod tokens;
1414
pub mod traits;
15+
pub mod tts;
1516
pub mod utils;
17+
pub mod vidgen;
1618
pub mod vision;
1719

1820
#[cfg(feature = "cli")]

src/main.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ async fn main() -> Result<()> {
2222
use gems::stream::StreamBuilder;
2323
use gems::tokens::TokenBuilder;
2424
use gems::traits::CTrait;
25+
use gems::tts::TtsGenBuilder;
26+
use gems::vidgen::VideoGenBuilder;
27+
2528
use gems::tui::run_tui;
2629
use gems::utils::{
2730
extract_text_from_partial_json, load_and_encode_image, type_with_cursor_effect,
@@ -183,7 +186,40 @@ async fn main() -> Result<()> {
183186

184187
let image_data = gemini_client.images().generate(params).await?;
185188

186-
std::fs::write("output.png", &image_data)?;
189+
tokio::fs::write("output.png", &image_data).await?;
190+
}
191+
Some(Command::Vidgen(cmd)) => {
192+
gemini_client.set_model(Model::Veo2);
193+
194+
let params = VideoGenBuilder::default()
195+
.model(Model::Veo2)
196+
.input(Message::User {
197+
content: Content::Text(cmd.text),
198+
name: None,
199+
})
200+
.build()
201+
.unwrap();
202+
203+
let bytes = gemini_client.videos().generate(params).await?;
204+
205+
tokio::fs::write("output.mp4", &bytes).await?;
206+
}
207+
Some(Command::Tts(cmd)) => {
208+
gemini_client.set_model(Model::Tts);
209+
210+
let params = TtsGenBuilder::default()
211+
.model(Model::Tts)
212+
.input(Message::User {
213+
content: Content::Text(cmd.text),
214+
name: None,
215+
})
216+
.voice(cmd.voice)
217+
.build()
218+
.unwrap();
219+
220+
let bytes = gemini_client.tts().generate(params).await?;
221+
222+
tokio::fs::write("output.pcm", &bytes).await?;
187223
}
188224
None => {
189225
let _ = run_tui().await;

src/models.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ pub enum Model {
1919
Embedding,
2020
Imagen3,
2121
Veo2,
22+
Tts,
2223
Flash20Live,
2324
FlashExpImage,
2425
}
@@ -36,6 +37,7 @@ impl ToString for Model {
3637
Model::Embedding => "text-embedding-004",
3738
Model::Imagen3 => "imagen-3.0-generate-002",
3839
Model::Veo2 => "veo-2.0-generate-001",
40+
Model::Tts => "gemini-2.5-flash-preview-tts",
3941
Model::Flash20Live => "gemini-2.0-flash-live-001",
4042
Model::FlashExpImage => "gemini-2.0-flash-exp-image-generation",
4143
}
@@ -57,6 +59,7 @@ impl FromStr for Model {
5759
"text-embedding-004" => Ok(Model::Embedding),
5860
"imagen-3.0-generate-002" => Ok(Model::Imagen3),
5961
"veo-2.0-generate-001" => Ok(Model::Veo2),
62+
"gemini-2.5-flash-preview-tts" => Ok(Model::Tts),
6063
"gemini-2.0-flash-live-001" => Ok(Model::Flash20Live),
6164
// Must parse to `FlashExpImage`: `ToString` emits this exact string for that variant,
// so mapping it to `Flash20Live` broke the string round-trip.
"gemini-2.0-flash-exp-image-generation" => Ok(Model::FlashExpImage),
6265
_ => Err(anyhow!("Unknown model: {}", s)),

src/requests.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,62 @@ pub struct GenerationConfig {
8585
#[serde(rename = "responseModalities")]
8686
pub response_modalities: Vec<String>,
8787
}
88+
89+
/// Request payload for video generation using Veo.
90+
#[derive(Debug, Serialize)]
91+
pub struct VideoGenRequest {
92+
pub instances: Vec<VideoPrompt>,
93+
pub parameters: VideoParameters,
94+
}
95+
96+
/// The prompt inside the request.
97+
#[derive(Debug, Serialize)]
98+
pub struct VideoPrompt {
99+
pub prompt: String,
100+
}
101+
102+
/// Optional parameters for generation behavior.
103+
#[derive(Debug, Serialize)]
104+
pub struct VideoParameters {
105+
#[serde(rename = "aspectRatio")]
106+
pub aspect_ratio: String,
107+
108+
#[serde(rename = "personGeneration")]
109+
pub person_generation: String,
110+
}
111+
112+
#[derive(Debug, Serialize)]
113+
pub struct TtsRequest {
114+
pub model: String,
115+
pub contents: Vec<Content>,
116+
#[serde(rename = "generationConfig")]
117+
pub generation_config: TtsGenerationConfig,
118+
#[serde(skip_serializing_if = "Option::is_none")]
119+
pub system_instruction: Option<Content>,
120+
}
121+
122+
#[derive(Debug, Serialize)]
123+
pub struct TtsGenerationConfig {
124+
#[serde(rename = "responseModalities")]
125+
pub response_modalities: Vec<String>,
126+
#[serde(rename = "speechConfig")]
127+
pub speech_config: SpeechConfig,
128+
}
129+
130+
#[derive(Debug, Serialize)]
131+
pub struct SpeechConfig {
132+
#[serde(rename = "voiceConfig")]
133+
pub voice_config: VoiceConfig,
134+
}
135+
136+
#[derive(Debug, Serialize)]
137+
pub struct VoiceConfig {
138+
#[serde(rename = "prebuiltVoiceConfig")]
139+
pub prebuilt_voice_config: PrebuiltVoiceConfig,
140+
}
141+
142+
#[derive(Debug, Serialize)]
143+
pub struct PrebuiltVoiceConfig {
144+
#[serde(rename = "voiceName")]
145+
pub voice_name: String,
146+
}

src/responses.rs

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,13 @@ pub struct ImagenResponse {
133133

134134
#[derive(Debug, Serialize, Deserialize)]
135135
#[serde(rename_all = "camelCase")]
136-
pub struct Candidate {
136+
pub struct TtsCandidate {
137137
pub content: Content,
138138
pub finish_reason: Option<String>,
139139
pub index: Option<i32>,
140140
}
141141

142-
#[derive(Debug, Serialize, Deserialize)]
142+
#[derive(Debug, Serialize, Deserialize, Clone)]
143143
#[serde(rename_all = "camelCase")]
144144
pub struct Content {
145145
pub parts: Vec<Part>,
@@ -156,6 +156,10 @@ pub enum Part {
156156
#[serde(rename = "inlineData")]
157157
inline_data: ImageContent,
158158
},
159+
Media {
160+
#[serde(rename = "inlineData")]
161+
inline_data: InlineData,
162+
},
159163
}
160164

161165
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -179,3 +183,72 @@ pub struct PromptTokenDetail {
179183
pub modality: Option<String>,
180184
pub token_count: Option<i32>,
181185
}
186+
187+
/// Response returned immediately after submitting the generation request.
188+
#[derive(Debug, Deserialize)]
189+
pub struct VideoGenResponse {
190+
pub name: Option<String>,
191+
}
192+
193+
/// Polling response to check operation status.
194+
#[derive(Debug, Deserialize)]
195+
pub struct OperationStatus {
196+
pub done: Option<bool>,
197+
pub error: Option<OperationError>,
198+
pub response: Option<OperationResponse>,
199+
}
200+
201+
/// Error details if the operation fails.
202+
#[derive(Debug, Deserialize)]
203+
pub struct OperationError {
204+
pub message: String,
205+
}
206+
207+
/// Successful operation result.
208+
#[derive(Debug, Deserialize)]
209+
pub struct OperationResponse {
210+
pub output: VideoOutput,
211+
}
212+
213+
/// Output payload containing the video.
214+
#[derive(Debug, Deserialize)]
215+
pub struct VideoOutput {
216+
pub video: EncodedVideo,
217+
}
218+
219+
/// The actual video content encoded in base64.
220+
#[derive(Debug, Deserialize)]
221+
pub struct EncodedVideo {
222+
#[serde(rename = "mimeType")]
223+
pub mime_type: String,
224+
225+
#[serde(rename = "base64Data")]
226+
pub base64_data: String,
227+
}
228+
229+
#[derive(Debug, Deserialize)]
230+
pub struct ErrorWrapper {
231+
pub error: ErrorMessage,
232+
}
233+
234+
#[derive(Debug, Deserialize)]
235+
pub struct ErrorMessage {
236+
pub message: String,
237+
}
238+
239+
#[derive(Debug, Deserialize)]
240+
pub struct TtsResponse {
241+
pub candidates: Option<Vec<Candidate>>,
242+
}
243+
244+
#[derive(Debug, Deserialize, Serialize, Clone)]
245+
pub struct Candidate {
246+
pub content: Content,
247+
}
248+
249+
#[derive(Debug, Deserialize, Serialize, Clone)]
250+
#[serde(rename_all = "camelCase")]
251+
pub struct InlineData {
252+
pub mime_type: String,
253+
pub data: String,
254+
}

src/traits.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ use crate::models::Model;
55
use crate::models::Models;
66
use crate::stream::Streaming;
77
use crate::tokens::Tokens;
8+
use crate::tts::Tts;
9+
use crate::vidgen::Videos;
810
use crate::vision::Visions;
911
use anyhow::Result;
1012
use reqwest::{Method, RequestBuilder};
@@ -23,4 +25,6 @@ pub trait CTrait {
2325
fn stream(&self) -> Streaming;
2426
fn models(&self) -> Models;
2527
fn images(&self) -> Images;
28+
fn videos(&self) -> Videos;
29+
fn tts(&self) -> Tts;
2630
}

0 commit comments

Comments
 (0)