Skip to content

Commit a5246f9

Browse files
authored
Merge pull request #52 from Azure-Samples/fixgpt
Fix data mapping to match new evaluate SDK expectations
2 parents 7a12df7 + 1b4a680 commit a5246f9

File tree

13 files changed

+477
-706
lines changed

13 files changed

+477
-706
lines changed

example_config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"testdata_path": "example_input/qa.jsonl",
33
"results_dir": "example_results/experiment<TIMESTAMP>",
4-
"requested_metrics": ["relevance", "coherence", "groundedness", "latency", "answer_length"],
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"],
55
"target_url": "http://localhost:50505/chat",
66
"target_parameters": {
77
"overrides": {

example_results/baseline_4/config.json renamed to example_results/baseline_1/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"testdata_path": "example_input/qa.jsonl",
33
"results_dir": "example_results/baseline_4",
4-
"requested_metrics": ["groundedness", "relevance", "coherence", "has_citation", "answer_length", "latency"],
5-
"target_url": "http://host.docker.internal:50505/chat",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "has_citation", "answer_length", "latency"],
5+
"target_url": "http://host.docker.internal:54846/chat",
66
"target_parameters": {
77
"overrides": {
88
"semantic_ranker": false

example_results/baseline_1/eval_results.jsonl

Lines changed: 200 additions & 200 deletions
Large diffs are not rendered by default.
Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
{
2-
"overrides": {
3-
"retrieval_mode": "hybrid",
4-
"semantic_ranker": false,
5-
"semantic_captions": false,
6-
"top": 3,
7-
"suggest_followup_questions": false
2+
"evaluation_gpt_model": "gpt-4",
3+
"evaluation_timestamp": 1708042205,
4+
"testdata_path": "/workspaces/ai-rag-chat-evaluator/example_input/qa.jsonl",
5+
"target_url": "http://host.docker.internal:54846/chat",
6+
"target_parameters": {
7+
"overrides": {
8+
"semantic_ranker": false
9+
}
810
},
9-
"evaluation_gpt_model": "gpt-4"
11+
"num_questions": 200
1012
}
Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,31 @@
11
{
2-
"gpt_groundedness": {
3-
"mean_rating": 4.79,
4-
"pass_count": 186,
5-
"pass_rate": 0.93
6-
},
7-
"gpt_relevance": {
8-
"mean_rating": 4.78,
9-
"pass_count": 188,
10-
"pass_rate": 0.94
11-
},
122
"gpt_coherence": {
13-
"mean_rating": 4.84,
14-
"pass_count": 192,
3+
"mean_rating": 4.85,
4+
"pass_count": 193,
155
"pass_rate": 0.96
166
},
17-
"gpt_similarity": {
18-
"mean_rating": 3.44,
19-
"pass_count": 124,
20-
"pass_rate": 0.62
7+
"gpt_relevance": {
8+
"mean_rating": 4.9,
9+
"pass_count": 194,
10+
"pass_rate": 0.97
11+
},
12+
"gpt_groundedness": {
13+
"mean_rating": 4.95,
14+
"pass_count": 197,
15+
"pass_rate": 0.98
2116
},
2217
"answer_length": {
23-
"total": 129793,
24-
"mean": 648.97,
25-
"max": 2575,
26-
"min": 93
18+
"mean": 687.84,
19+
"max": 2308,
20+
"min": 101
2721
},
2822
"answer_has_citation": {
29-
"total": 197,
30-
"rate": 0.98
23+
"total": 200,
24+
"rate": 1.0
25+
},
26+
"latency": {
27+
"mean": 2.83,
28+
"max": 6.582298,
29+
"min": 1.63455
3130
}
3231
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"testdata_path": "example_input/qa.jsonl",
3+
"results_dir": "example_results/baseline_2",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "has_citation", "answer_length", "latency"],
5+
"target_url": "http://host.docker.internal:54846/chat",
6+
"target_parameters": {
7+
"overrides": {
8+
"semantic_ranker": false
9+
}
10+
}
11+
}

example_results/baseline_2/eval_results.jsonl

Lines changed: 200 additions & 200 deletions
Large diffs are not rendered by default.
Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
{
2-
"overrides": {
3-
"retrieval_mode": "hybrid",
4-
"semantic_ranker": false,
5-
"semantic_captions": false,
6-
"top": 3,
7-
"suggest_followup_questions": false
2+
"evaluation_gpt_model": "gpt-4",
3+
"evaluation_timestamp": 1708043241,
4+
"testdata_path": "/workspaces/ai-rag-chat-evaluator/example_input/qa.jsonl",
5+
"target_url": "http://host.docker.internal:54846/chat",
6+
"target_parameters": {
7+
"overrides": {
8+
"semantic_ranker": false
9+
}
810
},
9-
"evaluation_gpt_model": "gpt-4"
11+
"num_questions": 200
1012
}
Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,31 @@
11
{
22
"gpt_groundedness": {
3-
"mean_rating": 4.89,
4-
"pass_count": 192,
5-
"pass_rate": 0.96
3+
"mean_rating": 4.96,
4+
"pass_count": 197,
5+
"pass_rate": 0.98
66
},
77
"gpt_relevance": {
8-
"mean_rating": 4.76,
9-
"pass_count": 188,
10-
"pass_rate": 0.94
8+
"mean_rating": 4.88,
9+
"pass_count": 195,
10+
"pass_rate": 0.97
1111
},
1212
"gpt_coherence": {
13-
"mean_rating": 4.8,
14-
"pass_count": 192,
15-
"pass_rate": 0.96
13+
"mean_rating": 4.85,
14+
"pass_count": 194,
15+
"pass_rate": 0.97
1616
},
17-
"gpt_similarity": {
18-
"mean_rating": 3.45,
19-
"pass_count": 124,
20-
"pass_rate": 0.62
17+
"has_citation": {
18+
"total": 198,
19+
"rate": 0.99
2120
},
2221
"answer_length": {
23-
"total": 131811,
24-
"mean": 659.05,
25-
"max": 2299,
26-
"min": 90
22+
"mean": 642.76,
23+
"max": 2120,
24+
"min": 96
2725
},
28-
"answer_has_citation": {
29-
"total": 197,
30-
"rate": 0.98
26+
"latency": {
27+
"mean": 2.68,
28+
"max": 5.640083,
29+
"min": 1.563895
3130
}
3231
}

example_results/baseline_4/eval_results.jsonl

Lines changed: 0 additions & 200 deletions
This file was deleted.

0 commit comments

Comments
 (0)