Skip to content

Commit 3564576

Browse files
committed
feed the reasoning tokens back through the agentic loop to improve results
1 parent f130468 commit 3564576

File tree

3 files changed

+10
-47
lines changed

3 files changed

+10
-47
lines changed

tools/agentic_mcp_evaluation/mcp_evaluation.py

Lines changed: 7 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
import traceback
2121
import xml.etree.ElementTree as ET #nosec B405: file is local and trusted within repo
2222
from pathlib import Path
23-
from typing import Any, Dict, List
23+
from typing import Any, Dict, List, Optional
2424

2525
from fastmcp.exceptions import ToolError
2626
from fastmcp.tools.tool import ToolResult
@@ -167,9 +167,6 @@ async def setup_mcp_server(self) -> None:
167167
async def _extract_tool_schemas(self) -> List[Dict[str, Any]]:
168168
"""Extract JSON schemas for all tools in the MCP server."""
169169
tools = []
170-
171-
# Debug: Print what's available in the server
172-
173170
# Get tools from the FastMCP instance
174171
for tool_name, tool_info in (await self.server.mcp.get_tools()).items():
175172
logger.info(f"🔍 Tool: {tool_name} - {tool_info.description}")
@@ -210,7 +207,7 @@ async def _call_mcp_tool(self, tool_name: str, **kwargs) -> MCPResultSet:
210207
return f"Tool {tool_name} failed: {formatted_error_list}"
211208

212209

213-
async def agent_loop(self, prompt: str) -> AgentLoopResult:
210+
async def agent_loop(self, prompt: str) -> Optional[AgentLoopResult]:
214211
"""Simplified agent loop using OpenAI Chat Completions with tool calling."""
215212
messages: List[Dict[str, Any]] = [
216213
{"role": "system", "content": AGENT_PROMPT},
@@ -225,14 +222,17 @@ async def agent_loop(self, prompt: str) -> AgentLoopResult:
225222
model=self.agent_model,
226223
messages=messages,
227224
tools=self.tools if self.tools else None,
225+
reasoning_effort="low",
228226
# tool_choice="auto",
229227
extra_body={
228+
# Preserve usage metrics
230229
"usage": {
231230
"include": True
232231
},
232+
# Ensure providers validate tool params
233233
"provider" : {
234234
"require_parameters" : True
235-
}
235+
},
236236
},
237237
)
238238
usage = resp.usage
@@ -248,18 +248,9 @@ async def agent_loop(self, prompt: str) -> AgentLoopResult:
248248
agent_usage += response_metrics
249249
choice = resp.choices[0]
250250
msg = choice.message
251-
251+
messages.append(msg.to_dict())
252252
# If there are tool calls, execute them and loop
253253
if getattr(msg, "tool_calls", None):
254-
# Append assistant message with tool calls
255-
messages.append(
256-
{
257-
"role": "assistant",
258-
"content": msg.content or "",
259-
"tool_calls": [tc.model_dump() for tc in msg.tool_calls],
260-
}
261-
)
262-
263254
for tc in msg.tool_calls:
264255
tool_name = tc.function.name
265256
try:

tools/agentic_mcp_evaluation/mcp_evaluation_tasks.xml

Lines changed: 1 addition & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -15,44 +15,17 @@
1515
<task>
1616
<prompt>Which neighborhoods have the highest share of users seeking a Long-term relationship? Rank by percentage.</prompt>
1717
</task>
18-
<task>
19-
<prompt>Show the top 10 hobbies among users aged 25–35 in Manhattan, with counts.</prompt>
20-
</task>
2118
<task>
2219
<prompt>Break down gender distribution by 3-year age buckets (18–20, 21–23, …) across all profiles.</prompt>
2320
</task>
2421
<task>
2522
<prompt>Among profiles that mention pets, which locations have the highest concentration of pet owners?</prompt>
2623
</task>
2724
<task>
28-
<prompt>List all conversations flagged as bad actors and summarize counts by primary_concern and behavior_severity.</prompt>
29-
</task>
30-
<task>
31-
<prompt>For moderation reports where escalation_observed = true, what are the most common primary_concern values and recommended_action outcomes?</prompt>
32-
</task>
33-
<!-- <task>
3425
<prompt>Identify repeat offenders: which user IDs appear most often as the primary_bad_actor? Return top 10 with counts and common concerns.</prompt>
3526
</task>
36-
<task>
37-
<prompt>What recommended_action is most frequently applied per risk category (e.g., harassment_risk, scam_fraud_risk)? Show a table mapping risk → top action with counts.</prompt>
38-
</task>
39-
<task>
40-
<prompt>Find profiles whose ideal_partner description includes the word "cat" and who are located in Manhattan. Return profile_id, full_name, location, pets.</prompt>
41-
</task>
42-
<task>
43-
<prompt>Compare the distribution of "looking_for" categories across Manhattan, Brooklyn, and Queens. Show counts by borough.</prompt>
44-
</task>
45-
<task>
46-
<prompt>From the moderation_report, list the top 5 conversations with harassment_risk = true ordered by behavior_severity, including conversation_summary (limit length) and recommended_action.</prompt>
47-
</task>
48-
<task>
49-
<prompt>Compute the proportion of moderation reports with any risk flag set (at least one risk=true) that also have conversation_flagged_as_bad_actor = true.</prompt>
50-
</task>
51-
<task>
52-
<prompt>For users aged 26–32 looking for a Long-term relationship, which occupations are most common? Return top 10 with counts.</prompt>
53-
</task>
5427
<task>
5528
<prompt>Are there patterns between behavior_severity and recommended_action? Provide counts for each (behavior_severity, recommended_action) pair.</prompt>
56-
</task> -->
29+
</task>
5730
</evaluation>
5831

tools/agentic_mcp_evaluation/mcp_to_evaluate.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -50,10 +50,9 @@ def setup_mcp_for_evaluation(local_session: fc.Session) -> FenicMCPServer:
5050
return fc.create_mcp_server(
5151
local_session,
5252
"Dating App Moderation Demo",
53-
# dynamic_tools=[semantic_profile_search, user_activity_report],
54-
automated_tool_generation=fc.ToolGenerationConfig(
53+
system_tools=fc.SystemToolConfig(
5554
table_names=["conversations", "enriched_profiles", "moderation_report"],
56-
tool_group_name="Dating App",
55+
tool_namespace="Dating App",
5756
max_result_rows=100
5857
)
5958
)

0 commit comments

Comments
 (0)