Skip to content

Commit e9b721d

Browse files
authored
Merge pull request #880 from hchen2020/master
Add audio transcript
2 parents 0fda49d + 7fa6bc1 commit e9b721d

File tree

10 files changed

+200
-64
lines changed

10 files changed

+200
-64
lines changed

src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,19 @@ Task Connect(RealtimeHubConnection conn,
1414
Action<string> onModelAudioDeltaReceived,
1515
Action onModelAudioResponseDone,
1616
Action<string> onAudioTranscriptDone,
17-
Action<string> onModelResponseDone,
17+
Action<List<RoleDialogModel>> onModelResponseDone,
18+
Action<string> onConversationItemCreated,
19+
Action<RoleDialogModel> onInputAudioTranscriptionCompleted,
1820
Action onUserInterrupted);
1921
Task AppenAudioBuffer(string message);
2022

2123
Task SendEventToModel(object message);
2224
Task Disconnect();
2325

2426
Task<RealtimeSession> CreateSession(Agent agent, List<RoleDialogModel> conversations);
25-
Task<string> UpdateInitialSession(RealtimeHubConnection conn);
26-
Task<string> InsertConversationItem(RoleDialogModel message);
27+
Task UpdateInitialSession(RealtimeHubConnection conn);
28+
Task InsertConversationItem(RoleDialogModel message);
2729
Task TriggerModelInference(string? instructions = null);
2830
Task<List<RoleDialogModel>> OnResponsedDone(RealtimeHubConnection conn, string response);
31+
Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string response);
2932
}

src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ public class RealtimeHubConnection
44
{
55
public string Event { get; set; } = null!;
66
public string StreamId { get; set; } = null!;
7+
public string EntryAgentId { get; set; } = null!;
78
public string ConversationId { get; set; } = null!;
89
public string Data { get; set; } = string.Empty;
910
public string Model { get; set; } = null!;

src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,15 @@ private async Task ConnectToModel(IRealTimeCompletion completer, WebSocket userW
6464
{
6565
var hookProvider = _services.GetRequiredService<ConversationHookProvider>();
6666
var storage = _services.GetRequiredService<IConversationStorage>();
67+
6768
var convService = _services.GetRequiredService<IConversationService>();
6869
convService.SetConversationId(conn.ConversationId, []);
6970
var conversation = await convService.GetConversation(conn.ConversationId);
71+
7072
var agentService = _services.GetRequiredService<IAgentService>();
7173
var agent = await agentService.LoadAgent(conversation.AgentId);
74+
conn.EntryAgentId = agent.Id;
75+
7276
var routing = _services.GetRequiredService<IRoutingService>();
7377
var dialogs = convService.GetDialogHistory();
7478
routing.Context.SetDialogs(dialogs);
@@ -77,19 +81,18 @@ await completer.Connect(conn,
7781
onModelReady: async () =>
7882
{
7983
// Control initial session
80-
var data = await completer.UpdateInitialSession(conn);
81-
await completer.SendEventToModel(data);
84+
await completer.UpdateInitialSession(conn);
85+
8286

8387
// Add dialog history
8488
foreach (var item in dialogs)
8589
{
86-
var dialogItem = await completer.InsertConversationItem(item);
87-
await completer.SendEventToModel(data);
90+
await completer.InsertConversationItem(item);
8891
}
8992

9093
if (dialogs.LastOrDefault()?.Role == AgentRole.Assistant)
9194
{
92-
await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}");
95+
// await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}");
9396
}
9497
else
9598
{
@@ -108,37 +111,49 @@ await completer.Connect(conn,
108111
},
109112
onAudioTranscriptDone: async transcript =>
110113
{
111-
var message = new RoleDialogModel(AgentRole.Assistant, transcript);
112-
113-
// append transcript to conversation
114-
storage.Append(conn.ConversationId, message);
115-
116-
foreach (var hook in hookProvider.HooksOrderByPriority)
117-
{
118-
hook.SetAgent(agent)
119-
.SetConversation(conversation);
120114

121-
if (!string.IsNullOrEmpty(transcript))
122-
{
123-
await hook.OnMessageReceived(message);
124-
}
125-
}
126115
},
127-
onModelResponseDone: async response =>
116+
onModelResponseDone: async messages =>
128117
{
129-
var messages = await completer.OnResponsedDone(conn, response);
130118
foreach (var message in messages)
131119
{
132120
// Invoke function
133-
if (message.FunctionName != null)
121+
if (message.MessageType == "function_call")
134122
{
135123
await routing.InvokeFunction(message.FunctionName, message);
136-
var data = await completer.InsertConversationItem(message);
137-
await completer.SendEventToModel(data);
124+
message.Role = AgentRole.Function;
125+
await completer.InsertConversationItem(message);
138126
await completer.TriggerModelInference("Reply based on the function's output.");
139127
}
128+
else
129+
{
130+
// append transcript to conversation
131+
storage.Append(conn.ConversationId, message);
132+
dialogs.Add(message);
133+
134+
foreach (var hook in hookProvider.HooksOrderByPriority)
135+
{
136+
hook.SetAgent(agent)
137+
.SetConversation(conversation);
138+
139+
if (!string.IsNullOrEmpty(message.Content))
140+
{
141+
await hook.OnMessageReceived(message);
142+
}
143+
}
144+
}
140145
}
141146
},
147+
onConversationItemCreated: async response =>
148+
{
149+
150+
},
151+
onInputAudioTranscriptionCompleted: async message =>
152+
{
153+
// append transcript to conversation
154+
storage.Append(conn.ConversationId, message);
155+
dialogs.Add(message);
156+
},
142157
onUserInterrupted: async () =>
143158
{
144159
var data = conn.OnModelUserInterrupted();

src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public async Task<bool> InvokeFunction(string name, RoleDialogModel message)
4949
}
5050

5151
// Set result to original message
52-
message.Role = AgentRole.Function;
52+
message.Role = clonedMessage.Role;
5353
message.PostbackFunctionName = clonedMessage.PostbackFunctionName;
5454
message.CurrentAgentId = clonedMessage.CurrentAgentId;
5555
message.Content = clonedMessage.Content;
src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ConversationItemCreated.cs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
namespace BotSharp.Plugin.OpenAI.Models.Realtime;
2+
3+
/// <summary>
/// Server event model that wraps a single conversation item under the JSON
/// <c>item</c> property, on top of the fields inherited from
/// <see cref="ServerEventResponse"/>.
/// </summary>
public class ConversationItemCreated : ServerEventResponse
{
    /// <summary>
    /// The conversation item carried by the event. Initialised to an empty
    /// body so it is never null when the payload omits <c>item</c>.
    /// </summary>
    [JsonPropertyName("item")]
    public ConversationItemBody Item { get; set; } = new ConversationItemBody();
}
8+
9+
/// <summary>
/// The <c>item</c> object of a conversation item event: its identifier,
/// item type, speaker role and list of content parts.
/// </summary>
public class ConversationItemBody
{
    /// <summary>Value of the JSON <c>id</c> property.</summary>
    [JsonPropertyName("id")]
    public string Id { get; set; } = null!;

    /// <summary>Value of the JSON <c>type</c> property.</summary>
    [JsonPropertyName("type")]
    public string Type { get; set; } = null!;

    /// <summary>Value of the JSON <c>role</c> property.</summary>
    [JsonPropertyName("role")]
    public string Role { get; set; } = null!;

    /// <summary>
    /// Content parts from the JSON <c>content</c> array; defaults to an
    /// empty array when the payload has none.
    /// </summary>
    [JsonPropertyName("content")]
    public ConversationItemContent[] Content { get; set; } = [];
}
22+
23+
/// <summary>
/// One content part of a conversation item.
/// </summary>
/// <remarks>
/// Only <see cref="Type"/> is expected on every part. <see cref="Transcript"/>
/// and <see cref="Audio"/> are optional: a text part (for example the
/// <c>input_text</c> parts this provider sends) carries neither, so both are
/// declared nullable rather than hidden behind <c>null!</c>.
/// </remarks>
public class ConversationItemContent
{
    /// <summary>Value of the JSON <c>type</c> property.</summary>
    [JsonPropertyName("type")]
    public string Type { get; set; } = null!;

    /// <summary>
    /// Value of the JSON <c>transcript</c> property, or <c>null</c> when the
    /// part does not include one.
    /// </summary>
    [JsonPropertyName("transcript")]
    public string? Transcript { get; set; }

    /// <summary>
    /// Value of the JSON <c>audio</c> property, or <c>null</c> when the part
    /// does not include one.
    /// </summary>
    [JsonPropertyName("audio")]
    public string? Audio { get; set; }
}

src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ public class RealtimeSessionBody
2828
[JsonPropertyName("output_audio_format")]
2929
public string OutputAudioFormat { get; set; } = "pcm16";
3030

31+
[JsonPropertyName("input_audio_transcription")]
32+
public InputAudioTranscription InputAudioTranscription { get; set; } = new();
33+
3134
[JsonPropertyName("instructions")]
3235
public string Instructions { get; set; } = "You are a friendly assistant.";
3336

@@ -63,4 +66,10 @@ public class RealtimeSessionTurnDetection
6366

6467
[JsonPropertyName("type")]
6568
public string Type { get; set; } = "server_vad";
69+
}
70+
71+
/// <summary>
/// Settings object serialized under <c>input_audio_transcription</c> in the
/// realtime session body.
/// </summary>
public class InputAudioTranscription
{
    /// <summary>
    /// Value written to the JSON <c>model</c> property. There is no default;
    /// the creator of the session body is expected to assign it.
    /// </summary>
    [JsonPropertyName("model")]
    public string Model { get; set; } = default!;
}

src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs

Lines changed: 87 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ public async Task Connect(RealtimeHubConnection conn,
4040
Action<string> onModelAudioDeltaReceived,
4141
Action onModelAudioResponseDone,
4242
Action<string> onAudioTranscriptDone,
43-
Action<string> onModelResponseDone,
43+
Action<List<RoleDialogModel>> onModelResponseDone,
44+
Action<string> onConversationItemCreated,
45+
Action<RoleDialogModel> onInputAudioTranscriptionCompleted,
4446
Action onUserInterrupted)
4547
{
4648
var settingsService = _services.GetRequiredService<ILlmProviderService>();
@@ -57,10 +59,13 @@ public async Task Connect(RealtimeHubConnection conn,
5759
onModelReady();
5860

5961
// Receive a message
60-
_ = ReceiveMessage(onModelAudioDeltaReceived,
62+
_ = ReceiveMessage(conn,
63+
onModelAudioDeltaReceived,
6164
onModelAudioResponseDone,
6265
onAudioTranscriptDone,
6366
onModelResponseDone,
67+
onConversationItemCreated,
68+
onInputAudioTranscriptionCompleted,
6469
onUserInterrupted);
6570
}
6671
}
@@ -94,10 +99,13 @@ await SendEventToModel(new
9499
});
95100
}
96101

97-
private async Task ReceiveMessage(Action<string> onModelAudioDeltaReceived,
102+
private async Task ReceiveMessage(RealtimeHubConnection conn,
103+
Action<string> onModelAudioDeltaReceived,
98104
Action onModelAudioResponseDone,
99105
Action<string> onAudioTranscriptDone,
100-
Action<string> onModelResponseDone,
106+
Action<List<RoleDialogModel>> onModelResponseDone,
107+
Action<string> onConversationItemCreated,
108+
Action<RoleDialogModel> onInputAudioTranscriptionCompleted,
101109
Action onUserInterrupted)
102110
{
103111
var buffer = new byte[1024 * 1024 * 1];
@@ -158,7 +166,20 @@ private async Task ReceiveMessage(Action<string> onModelAudioDeltaReceived,
158166
else if (response.Type == "response.done")
159167
{
160168
_logger.LogInformation($"{response.Type}: {receivedText}");
161-
onModelResponseDone(receivedText);
169+
await Task.Delay(1000);
170+
var messages = await OnResponsedDone(conn, receivedText);
171+
onModelResponseDone(messages);
172+
}
173+
else if (response.Type == "conversation.item.created")
174+
{
175+
_logger.LogInformation($"{response.Type}: {receivedText}");
176+
onConversationItemCreated(receivedText);
177+
}
178+
else if (response.Type == "conversation.item.input_audio_transcription.completed")
179+
{
180+
_logger.LogInformation($"{response.Type}: {receivedText}");
181+
var message = await OnInputAudioTranscriptionCompleted(conn, receivedText);
182+
onInputAudioTranscriptionCompleted(message);
162183
}
163184
else if (response.Type == "input_audio_buffer.speech_started")
164185
{
@@ -226,7 +247,7 @@ public async Task<RealtimeSession> CreateSession(Agent agent, List<RoleDialogMod
226247
return session;
227248
}
228249

229-
public async Task<string> UpdateInitialSession(RealtimeHubConnection conn)
250+
public async Task UpdateInitialSession(RealtimeHubConnection conn)
230251
{
231252
var convService = _services.GetRequiredService<IConversationService>();
232253
var conv = await convService.GetConversation(conn.ConversationId);
@@ -247,6 +268,10 @@ public async Task<string> UpdateInitialSession(RealtimeHubConnection conn)
247268
{
248269
InputAudioFormat = "g711_ulaw",
249270
OutputAudioFormat = "g711_ulaw",
271+
InputAudioTranscription = new InputAudioTranscription
272+
{
273+
Model = "whisper-1",
274+
},
250275
Voice = "alloy",
251276
Instructions = instruction,
252277
ToolChoice = "auto",
@@ -265,10 +290,10 @@ public async Task<string> UpdateInitialSession(RealtimeHubConnection conn)
265290
}
266291
};
267292

268-
return JsonSerializer.Serialize(sessionUpdate);
293+
await SendEventToModel(sessionUpdate);
269294
}
270295

271-
public async Task<string> InsertConversationItem(RoleDialogModel message)
296+
public async Task InsertConversationItem(RoleDialogModel message)
272297
{
273298
if (message.Role == AgentRole.Function)
274299
{
@@ -282,10 +307,10 @@ public async Task<string> InsertConversationItem(RoleDialogModel message)
282307
output = message.Content
283308
}
284309
};
285-
return JsonSerializer.Serialize(functionConversationItem);
310+
311+
await SendEventToModel(functionConversationItem);
286312
}
287-
else if (message.Role == AgentRole.User ||
288-
message.Role == AgentRole.Assistant)
313+
else if (message.Role == AgentRole.Assistant)
289314
{
290315
var conversationItem = new
291316
{
@@ -305,7 +330,29 @@ public async Task<string> InsertConversationItem(RoleDialogModel message)
305330
}
306331
};
307332

308-
return JsonSerializer.Serialize(conversationItem);
333+
await SendEventToModel(conversationItem);
334+
}
335+
else if (message.Role == AgentRole.User)
336+
{
337+
var conversationItem = new
338+
{
339+
type = "conversation.item.create",
340+
item = new
341+
{
342+
type = "message",
343+
role = message.Role,
344+
content = new object[]
345+
{
346+
new
347+
{
348+
type = "input_text",
349+
text = message.Content
350+
}
351+
}
352+
}
353+
};
354+
355+
await SendEventToModel(conversationItem);
309356
}
310357
else
311358
{
@@ -507,16 +554,42 @@ public async Task<List<RoleDialogModel>> OnResponsedDone(RealtimeHubConnection c
507554
{
508555
if (output.Type == "function_call")
509556
{
510-
outputs.Add(new RoleDialogModel(AgentRole.Assistant, output.Arguments)
557+
outputs.Add(new RoleDialogModel(output.Role, output.Arguments)
511558
{
559+
CurrentAgentId = conn.EntryAgentId,
512560
FunctionName = output.Name,
513561
FunctionArgs = output.Arguments,
514-
MessageType = output.Type,
515562
ToolCallId = output.CallId
516563
});
517564
}
565+
else if (output.Type == "message")
566+
{
567+
var content = output.Content.FirstOrDefault();
568+
569+
outputs.Add(new RoleDialogModel(output.Role, content.Transcript)
570+
{
571+
CurrentAgentId = conn.EntryAgentId
572+
});
573+
}
518574
}
519575

520576
return outputs;
521577
}
578+
579+
/// <summary>
/// Builds a user dialog message from the raw JSON of an input-audio
/// transcription event.
/// </summary>
/// <param name="conn">Connection whose <c>EntryAgentId</c> is stamped on the message.</param>
/// <param name="response">Raw JSON text of the server event.</param>
/// <returns>
/// A message with role <c>AgentRole.User</c> whose content is the transcript,
/// or an empty string when the payload carries no transcript.
/// </returns>
public async Task<RoleDialogModel> OnInputAudioTranscriptionCompleted(RealtimeHubConnection conn, string response)
{
    // Deserialize yields null for a JSON "null" payload, and the transcript
    // itself may be absent. Fall back to an empty string instead of throwing,
    // because an exception here would fault the caller's receive loop.
    var data = JsonSerializer.Deserialize<ResponseAudioTranscript>(response);
    var transcript = data?.Transcript ?? string.Empty;

    return new RoleDialogModel(AgentRole.User, transcript)
    {
        CurrentAgentId = conn.EntryAgentId
    };
}
587+
588+
/// <summary>
/// Builds a dialog message from the raw JSON of a conversation-item-created
/// event, using the item's role and the transcript of its first content part.
/// </summary>
/// <param name="conn">Current realtime connection (not read by this method).</param>
/// <param name="response">Raw JSON text of the server event.</param>
/// <returns>
/// A message whose role is the item's role and whose content is the first
/// content part's transcript, or an empty string when there is none.
/// </returns>
public async Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string response)
{
    // Deserialize yields null for a JSON "null" payload; substitute an empty
    // item body so the lookups below cannot dereference null.
    var item = JsonSerializer.Deserialize<ConversationItemCreated>(response)?.Item
        ?? new ConversationItemBody();

    // Items without content parts, or whose first part has no transcript
    // (e.g. text parts), produce an empty message text rather than null.
    var transcript = item.Content?.FirstOrDefault()?.Transcript ?? string.Empty;

    return new RoleDialogModel(item.Role, transcript);
}
522595
}

0 commit comments

Comments
 (0)