Feed the reasoning tokens back through the agentic loop to improve results

bcallender · bcallender · commit 35645765dde6 · 2025-11-17T11:23:23.000-08:00
diff --git a/tools/agentic_mcp_evaluation/mcp_evaluation.py b/tools/agentic_mcp_evaluation/mcp_evaluation.py
@@ -20,7 +20,7 @@
 import traceback
 import xml.etree.ElementTree as ET  #nosec B405: file is local and trusted within repo
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from fastmcp.exceptions import ToolError
 from fastmcp.tools.tool import ToolResult
@@ -167,9 +167,6 @@ async def setup_mcp_server(self) -> None:
     async def _extract_tool_schemas(self) -> List[Dict[str, Any]]:
         """Extract JSON schemas for all tools in the MCP server."""
         tools = []
-
-        # Debug: Print what's available in the server
-
         # Get tools from the FastMCP instance
         for tool_name, tool_info in (await self.server.mcp.get_tools()).items():
             logger.info(f"🔍 Tool: {tool_name} - {tool_info.description}")
@@ -210,7 +207,7 @@ async def _call_mcp_tool(self, tool_name: str, **kwargs) -> MCPResultSet:
             return f"Tool {tool_name} failed: {formatted_error_list}"
 
 
-    async def agent_loop(self, prompt: str) -> AgentLoopResult:
+    async def agent_loop(self, prompt: str) -> Optional[AgentLoopResult]:
         """Simplified agent loop using OpenAI Chat Completions with tool calling."""
         messages: List[Dict[str, Any]] = [
             {"role": "system", "content": AGENT_PROMPT},
@@ -225,14 +222,17 @@ async def agent_loop(self, prompt: str) -> AgentLoopResult:
                 model=self.agent_model,
                 messages=messages,
                 tools=self.tools if self.tools else None,
+                reasoning_effort="low",
                 # tool_choice="auto",
                 extra_body={
+                    # Preserve usage metrics
                     "usage": {
                         "include": True
                     },  
+                    # Ensure providers validate tool params
                     "provider" : {
                             "require_parameters" : True
-                        }
+                        },
                 },
             )
             usage = resp.usage
@@ -248,18 +248,9 @@ async def agent_loop(self, prompt: str) -> AgentLoopResult:
             agent_usage += response_metrics
             choice = resp.choices[0]
             msg = choice.message
-
+            messages.append(msg.to_dict())
             # If there are tool calls, execute them and loop
             if getattr(msg, "tool_calls", None):
-                # Append assistant message with tool calls
-                messages.append(
-                    {
-                        "role": "assistant",
-                        "content": msg.content or "",
-                        "tool_calls": [tc.model_dump() for tc in msg.tool_calls],
-                    }
-                )
-
                 for tc in msg.tool_calls:
                     tool_name = tc.function.name
                     try:
diff --git a/tools/agentic_mcp_evaluation/mcp_evaluation_tasks.xml b/tools/agentic_mcp_evaluation/mcp_evaluation_tasks.xml
@@ -15,44 +15,17 @@
     <task>
         <prompt>Which neighborhoods have the highest share of users seeking a Long-term relationship? Rank by percentage.</prompt>
     </task>
-    <task>
-        <prompt>Show the top 10 hobbies among users aged 25–35 in Manhattan, with counts.</prompt>
-    </task>
     <task>
         <prompt>Break down gender distribution by 3-year age buckets (18–20, 21–23, …) across all profiles.</prompt>
     </task>
     <task>
         <prompt>Among profiles that mention pets, which locations have the highest concentration of pet owners?</prompt>
     </task>
     <task>
-        <prompt>List all conversations flagged as bad actors and summarize counts by primary_concern and behavior_severity.</prompt>
-    </task>
-    <task>
-        <prompt>For moderation reports where escalation_observed = true, what are the most common primary_concern values and recommended_action outcomes?</prompt>
-    </task>
-    <!-- <task>
         <prompt>Identify repeat offenders: which user IDs appear most often as the primary_bad_actor? Return top 10 with counts and common concerns.</prompt>
     </task>
-    <task>
-        <prompt>What recommended_action is most frequently applied per risk category (e.g., harassment_risk, scam_fraud_risk)? Show a table mapping risk → top action with counts.</prompt>
-    </task>
-    <task>
-        <prompt>Find profiles whose ideal_partner description includes the word "cat" and who are located in Manhattan. Return profile_id, full_name, location, pets.</prompt>
-    </task>
-    <task>
-        <prompt>Compare the distribution of "looking_for" categories across Manhattan, Brooklyn, and Queens. Show counts by borough.</prompt>
-    </task>
-    <task>
-        <prompt>From the moderation_report, list the top 5 conversations with harassment_risk = true ordered by behavior_severity, including conversation_summary (limit length) and recommended_action.</prompt>
-    </task>
-    <task>
-        <prompt>Compute the proportion of moderation reports with any risk flag set (at least one risk=true) that also have conversation_flagged_as_bad_actor = true.</prompt>
-    </task>
-    <task>
-        <prompt>For users aged 26–32 looking for a Long-term relationship, which occupations are most common? Return top 10 with counts.</prompt>
-    </task>
     <task>
         <prompt>Are there patterns between behavior_severity and recommended_action? Provide counts for each (behavior_severity, recommended_action) pair.</prompt>
-    </task> -->
+    </task>
 </evaluation>
 
diff --git a/tools/agentic_mcp_evaluation/mcp_to_evaluate.py b/tools/agentic_mcp_evaluation/mcp_to_evaluate.py
@@ -50,10 +50,9 @@ def setup_mcp_for_evaluation(local_session: fc.Session) -> FenicMCPServer:
     return fc.create_mcp_server(
         local_session,
         "Dating App Moderation Demo",
-        # dynamic_tools=[semantic_profile_search, user_activity_report],
-        automated_tool_generation=fc.ToolGenerationConfig(
+        system_tools=fc.SystemToolConfig(
             table_names=["conversations", "enriched_profiles", "moderation_report"],
-            tool_group_name="Dating App",
+            tool_namespace="Dating App",
             max_result_rows=100
         )
     )

Original file line number	Diff line number	Diff line change
`@@ -50,10 +50,9 @@ def setup_mcp_for_evaluation(local_session: fc.Session) -> FenicMCPServer:`
`50`	`50`	`return fc.create_mcp_server(`
`51`	`51`	`local_session,`
`52`	`52`	`"Dating App Moderation Demo",`
`53`		`- # dynamic_tools=[semantic_profile_search, user_activity_report],`
`54`		`- automated_tool_generation=fc.ToolGenerationConfig(`
	`53`	`+ system_tools=fc.SystemToolConfig(`
`55`	`54`	`table_names=["conversations", "enriched_profiles", "moderation_report"],`
`56`		`- tool_group_name="Dating App",`
	`55`	`+ tool_namespace="Dating App",`
`57`	`56`	`max_result_rows=100`
`58`	`57`	`)`
`59`	`58`	`)`