Skip to content

Commit 850efd9

Browse files
committed
Clearing response completeness and task navigation efficiency output
1 parent b7c299c commit 850efd9

File tree

2 files changed

+33
-339
lines changed

2 files changed

+33
-339
lines changed

sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/response_completeness.ipynb

Lines changed: 12 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,6 @@
7878
"response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=model_config)\n"
7979
]
8080
},
81-
{
82-
"cell_type": "code",
83-
"execution_count": null,
84-
"metadata": {},
85-
"outputs": [],
86-
"source": [
87-
"from azure.ai.evaluation import ResponseCompletenessEvaluator , AzureOpenAIModelConfiguration\n",
88-
"from pprint import pprint\n"
89-
]
90-
},
9181
{
9282
"cell_type": "markdown",
9383
"metadata": {},
@@ -104,30 +94,9 @@
10494
},
10595
{
10696
"cell_type": "code",
107-
"execution_count": 15,
97+
"execution_count": null,
10898
"metadata": {},
109-
"outputs": [
110-
{
111-
"data": {
112-
"text/plain": [
113-
"{'response_completeness': 1,\n",
114-
" 'response_completeness_result': 'fail',\n",
115-
" 'response_completeness_threshold': 3,\n",
116-
" 'response_completeness_reason': 'The response does not provide the answer (\"Tokyo\") and is missing all relevant information from the ground truth.',\n",
117-
" 'response_completeness_prompt_tokens': 1354,\n",
118-
" 'response_completeness_completion_tokens': 107,\n",
119-
" 'response_completeness_total_tokens': 1461,\n",
120-
" 'response_completeness_finish_reason': 'stop',\n",
121-
" 'response_completeness_model': 'gpt-4.1-2025-04-14',\n",
122-
" 'response_completeness_sample_input': '[{\"role\": \"user\", \"content\": \"{\\\\\"response\\\\\": \\\\\"The capital of Japan\\\\\", \\\\\"ground_truth\\\\\": \\\\\"The capital of Japan is Tokyo.\\\\\"}\"}]',\n",
123-
" 'response_completeness_sample_output': '[{\"role\": \"assistant\", \"content\": \"<S0>Let\\'s think step by step: The ground truth states \\\\\"The capital of Japan is Tokyo.\\\\\" The response is \\\\\"The capital of Japan.\\\\\" The response does not specify what the capital is; it only repeats part of the question and omits the key information (\\\\\"Tokyo\\\\\"). Therefore, none of the necessary information is present in the response.</S0>\\\\n<S1>The response does not provide the answer (\\\\\"Tokyo\\\\\") and is missing all relevant information from the ground truth.</S1>\\\\n<S2>1</S2>\"}]'}"
124-
]
125-
},
126-
"execution_count": 15,
127-
"metadata": {},
128-
"output_type": "execute_result"
129-
}
130-
],
99+
"outputs": [],
131100
"source": [
132101
"result = response_completeness_evaluator(\n",
133102
" response=\"The capital of Japan\",\n",
@@ -138,30 +107,9 @@
138107
},
139108
{
140109
"cell_type": "code",
141-
"execution_count": 16,
110+
"execution_count": null,
142111
"metadata": {},
143-
"outputs": [
144-
{
145-
"data": {
146-
"text/plain": [
147-
"{'response_completeness': 5,\n",
148-
" 'response_completeness_result': 'pass',\n",
149-
" 'response_completeness_threshold': 3,\n",
150-
" 'response_completeness_reason': 'The response is fully complete as it perfectly matches the ground truth statement.',\n",
151-
" 'response_completeness_prompt_tokens': 1356,\n",
152-
" 'response_completeness_completion_tokens': 85,\n",
153-
" 'response_completeness_total_tokens': 1441,\n",
154-
" 'response_completeness_finish_reason': 'stop',\n",
155-
" 'response_completeness_model': 'gpt-4.1-2025-04-14',\n",
156-
" 'response_completeness_sample_input': '[{\"role\": \"user\", \"content\": \"{\\\\\"response\\\\\": \\\\\"The capital of Japan is Tokyo.\\\\\", \\\\\"ground_truth\\\\\": \\\\\"The capital of Japan is Tokyo.\\\\\"}\"}]',\n",
157-
" 'response_completeness_sample_output': '[{\"role\": \"assistant\", \"content\": \"<S0>Let\\'s think step by step: The ground truth contains a single statement: \\\\\"The capital of Japan is Tokyo.\\\\\" The response exactly matches this statement without omitting or altering any information. There are no missing or incorrect details, and all relevant information is included.</S0>\\\\n<S1>The response is fully complete as it perfectly matches the ground truth statement.</S1>\\\\n<S2>5</S2>\"}]'}"
158-
]
159-
},
160-
"execution_count": 16,
161-
"metadata": {},
162-
"output_type": "execute_result"
163-
}
164-
],
112+
"outputs": [],
165113
"source": [
166114
"result = response_completeness_evaluator(\n",
167115
" response=\"The capital of Japan is Tokyo.\",\n",
@@ -179,30 +127,9 @@
179127
},
180128
{
181129
"cell_type": "code",
182-
"execution_count": 17,
130+
"execution_count": null,
183131
"metadata": {},
184-
"outputs": [
185-
{
186-
"data": {
187-
"text/plain": [
188-
"{'response_completeness': 5,\n",
189-
" 'response_completeness_result': 'pass',\n",
190-
" 'response_completeness_threshold': 3,\n",
191-
" 'response_completeness_reason': 'The response perfectly matches the ground truth and includes all necessary information.',\n",
192-
" 'response_completeness_prompt_tokens': 1356,\n",
193-
" 'response_completeness_completion_tokens': 95,\n",
194-
" 'response_completeness_total_tokens': 1451,\n",
195-
" 'response_completeness_finish_reason': 'stop',\n",
196-
" 'response_completeness_model': 'gpt-4.1-2025-04-14',\n",
197-
" 'response_completeness_sample_input': '[{\"role\": \"user\", \"content\": \"{\\\\\"response\\\\\": \\\\\"The capital of Japan is Tokyo.\\\\\", \\\\\"ground_truth\\\\\": \\\\\"The capital of Japan is Tokyo.\\\\\"}\"}]',\n",
198-
" 'response_completeness_sample_output': '[{\"role\": \"assistant\", \"content\": \"<S0>Let\\'s think step by step: The ground truth contains a single statement: \\\\\"The capital of Japan is Tokyo.\\\\\" The response exactly matches this statement without missing any information or introducing errors. There are no additional claims or details in the ground truth that need to be included, and the response is fully accurate and complete.</S0>\\\\n<S1>The response perfectly matches the ground truth and includes all necessary information.</S1>\\\\n<S2>5</S2>\"}]'}"
199-
]
200-
},
201-
"execution_count": 17,
202-
"metadata": {},
203-
"output_type": "execute_result"
204-
}
205-
],
132+
"outputs": [],
206133
"source": [
207134
"from azure.ai.evaluation import ResponseCompletenessEvaluator , AzureOpenAIModelConfiguration\n",
208135
"from pprint import pprint\n",
@@ -227,30 +154,9 @@
227154
},
228155
{
229156
"cell_type": "code",
230-
"execution_count": 18,
157+
"execution_count": null,
231158
"metadata": {},
232-
"outputs": [
233-
{
234-
"data": {
235-
"text/plain": [
236-
"{'response_completeness': 3,\n",
237-
" 'response_completeness_result': 'pass',\n",
238-
" 'response_completeness_threshold': 3,\n",
239-
" 'response_completeness_reason': 'The response provides only the Saturday forecast and omits the Sunday temperature and rain chance, so it is moderately complete.',\n",
240-
" 'response_completeness_prompt_tokens': 1398,\n",
241-
" 'response_completeness_completion_tokens': 150,\n",
242-
" 'response_completeness_total_tokens': 1548,\n",
243-
" 'response_completeness_finish_reason': 'stop',\n",
244-
" 'response_completeness_model': 'gpt-4.1-2025-04-14',\n",
245-
" 'response_completeness_sample_input': '[{\"role\": \"user\", \"content\": \"{\\\\\"response\\\\\": \\\\\"The weather in Seattle this weekend will be partly cloudy with temperatures around 15\\\\\\\\u00b0C on Saturday.\\\\\", \\\\\"ground_truth\\\\\": \\\\\"The weather in Seattle this weekend will be partly cloudy with temperatures around 15\\\\\\\\u00b0C on Saturday and 17\\\\\\\\u00b0C on Sunday, with a 20% chance of rain on Sunday afternoon.\\\\\"}\"}]',\n",
246-
" 'response_completeness_sample_output': '[{\"role\": \"assistant\", \"content\": \"<S0>Let\\'s think step by step: First, I will identify the key statements in the ground truth: (1) partly cloudy weather in Seattle this weekend, (2) temperatures around 15\\\\u00b0C on Saturday, (3) temperatures around 17\\\\u00b0C on Sunday, (4) 20% chance of rain on Sunday afternoon. The response includes (1) and (2) but omits (3) and (4), which are important details for a complete weekend forecast. Therefore, the response contains about half of the necessary information.</S0>\\\\n<S1>The response provides only the Saturday forecast and omits the Sunday temperature and rain chance, so it is moderately complete.</S1>\\\\n<S2>3</S2>\"}]'}"
247-
]
248-
},
249-
"execution_count": 18,
250-
"metadata": {},
251-
"output_type": "execute_result"
252-
}
253-
],
159+
"outputs": [],
254160
"source": [
255161
"# Conversation format with ground truth in context\n",
256162
"conversation = {\n",
@@ -281,22 +187,9 @@
281187
},
282188
{
283189
"cell_type": "code",
284-
"execution_count": 19,
190+
"execution_count": null,
285191
"metadata": {},
286-
"outputs": [
287-
{
288-
"ename": "KeyError",
289-
"evalue": "'AZURE_SUBSCRIPTION_ID'",
290-
"output_type": "error",
291-
"traceback": [
292-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
293-
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
294-
"Cell \u001b[0;32mIn[19], line 28\u001b[0m\n\u001b[1;32m 21\u001b[0m pd\u001b[38;5;241m.\u001b[39mDataFrame(data)\u001b[38;5;241m.\u001b[39mto_json(\n\u001b[1;32m 22\u001b[0m file_path, orient\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrecords\u001b[39m\u001b[38;5;124m\"\u001b[39m, lines\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 23\u001b[0m )\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mazure\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mai\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mevaluation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m evaluate\n\u001b[1;32m 27\u001b[0m azure_ai_project\u001b[38;5;241m=\u001b[39m{\n\u001b[0;32m---> 28\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msubscription_id\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menviron\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mAZURE_SUBSCRIPTION_ID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m,\n\u001b[1;32m 29\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPROJECT_NAME\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 30\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresource_group_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRESOURCE_GROUP_NAME\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 31\u001b[0m }\n\u001b[1;32m 33\u001b[0m response \u001b[38;5;241m=\u001b[39m evaluate(\n\u001b[1;32m 34\u001b[0m data\u001b[38;5;241m=\u001b[39mfile_path,\n\u001b[1;32m 35\u001b[0m evaluators\u001b[38;5;241m=\u001b[39m{\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 38\u001b[0m azure_ai_project\u001b[38;5;241m=\u001b[39mazure_ai_project,\n\u001b[1;32m 39\u001b[0m )\n\u001b[1;32m 41\u001b[0m pprint(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAI Foundry URL: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstudio_url\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
295-
"File \u001b[0;32m~/miniconda3/envs/samples/lib/python3.10/os.py:680\u001b[0m, in \u001b[0;36m_Environ.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 677\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencodekey(key)]\n\u001b[1;32m 678\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[1;32m 679\u001b[0m \u001b[38;5;66;03m# raise KeyError with the original key value\u001b[39;00m\n\u001b[0;32m--> 680\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 681\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecodevalue(value)\n",
296-
"\u001b[0;31mKeyError\u001b[0m: 'AZURE_SUBSCRIPTION_ID'"
297-
]
298-
}
299-
],
192+
"outputs": [],
300193
"source": [
301194
"import json\n",
302195
"import pandas as pd\n",
@@ -344,7 +237,7 @@
344237
],
345238
"metadata": {
346239
"kernelspec": {
347-
"display_name": "test_agent_evaluator_prp",
240+
"display_name": "samples",
348241
"language": "python",
349242
"name": "python3"
350243
},
@@ -358,7 +251,7 @@
358251
"name": "python",
359252
"nbconvert_exporter": "python",
360253
"pygments_lexer": "ipython3",
361-
"version": "3.12.9"
254+
"version": "3.10.19"
362255
}
363256
},
364257
"nbformat": 4,

0 commit comments

Comments
 (0)