Azure-Samples
diff --git a/example_config.json b/example_config.json
Lines changed: 1 addition & 1 deletion
diff --git a/example_results/baseline_4/config.json b/example_results/baseline_1/config.json
rename from example_results/baseline_4/config.json
rename to example_results/baseline_1/config.json
Lines changed: 2 additions & 2 deletions
diff --git a/example_results/baseline_1/eval_results.jsonl b/example_results/baseline_1/eval_results.jsonl
Lines changed: 200 additions & 200 deletions
diff --git a/example_results/baseline_1/evaluate_parameters.json b/example_results/baseline_1/evaluate_parameters.json
Lines changed: 9 additions & 7 deletions
diff --git a/example_results/baseline_1/summary.json b/example_results/baseline_1/summary.json
Lines changed: 21 additions & 22 deletions
diff --git a/example_results/baseline_2/config.json b/example_results/baseline_2/config.json
Lines changed: 11 additions & 0 deletions
diff --git a/example_results/baseline_2/eval_results.jsonl b/example_results/baseline_2/eval_results.jsonl
Lines changed: 200 additions & 200 deletions
diff --git a/example_results/baseline_2/evaluate_parameters.json b/example_results/baseline_2/evaluate_parameters.json
Lines changed: 9 additions & 7 deletions
diff --git a/example_results/baseline_2/summary.json b/example_results/baseline_2/summary.json
Lines changed: 19 additions & 20 deletions
diff --git a/example_results/baseline_4/eval_results.jsonl b/example_results/baseline_4/eval_results.jsonl
Lines changed: 0 additions & 200 deletions
@@ -1,7 +1,7 @@
 {
     "testdata_path": "example_input/qa.jsonl",
     "results_dir": "example_results/experiment<TIMESTAMP>",
-    "requested_metrics": ["relevance", "coherence", "groundedness", "latency", "answer_length"],
+    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"],
     "target_url": "http://localhost:50505/chat",
     "target_parameters": {
         "overrides": {
 
@@ -1,8 +1,8 @@
 {
     "testdata_path": "example_input/qa.jsonl",
     "results_dir": "example_results/baseline_4",
-    "requested_metrics": ["groundedness", "relevance", "coherence", "has_citation", "answer_length", "latency"],
-    "target_url": "http://host.docker.internal:50505/chat",
+    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "has_citation", "answer_length", "latency"],
+    "target_url": "http://host.docker.internal:54846/chat",
     "target_parameters": {
         "overrides": {
             "semantic_ranker": false
 
@@ -1,10 +1,12 @@
 {
-    "overrides": {
-        "retrieval_mode": "hybrid",
-        "semantic_ranker": false,
-        "semantic_captions": false,
-        "top": 3,
-        "suggest_followup_questions": false
+    "evaluation_gpt_model": "gpt-4",
+    "evaluation_timestamp": 1708042205,
+    "testdata_path": "/workspaces/ai-rag-chat-evaluator/example_input/qa.jsonl",
+    "target_url": "http://host.docker.internal:54846/chat",
+    "target_parameters": {
+        "overrides": {
+            "semantic_ranker": false
+        }
     },
-    "evaluation_gpt_model": "gpt-4"
+    "num_questions": 200
 }
@@ -1,32 +1,31 @@
 {
-    "gpt_groundedness": {
-        "mean_rating": 4.79,
-        "pass_count": 186,
-        "pass_rate": 0.93
-    },
-    "gpt_relevance": {
-        "mean_rating": 4.78,
-        "pass_count": 188,
-        "pass_rate": 0.94
-    },
     "gpt_coherence": {
-        "mean_rating": 4.84,
-        "pass_count": 192,
+        "mean_rating": 4.85,
+        "pass_count": 193,
         "pass_rate": 0.96
     },
-    "gpt_similarity": {
-        "mean_rating": 3.44,
-        "pass_count": 124,
-        "pass_rate": 0.62
+    "gpt_relevance": {
+        "mean_rating": 4.9,
+        "pass_count": 194,
+        "pass_rate": 0.97
+    },
+    "gpt_groundedness": {
+        "mean_rating": 4.95,
+        "pass_count": 197,
+        "pass_rate": 0.98
     },
     "answer_length": {
-        "total": 129793,
-        "mean": 648.97,
-        "max": 2575,
-        "min": 93
+        "mean": 687.84,
+        "max": 2308,
+        "min": 101
     },
     "answer_has_citation": {
-        "total": 197,
-        "rate": 0.98
+        "total": 200,
+        "rate": 1.0
+    },
+    "latency": {
+        "mean": 2.83,
+        "max": 6.582298,
+        "min": 1.63455
     }
 }
@@ -0,0 +1,11 @@
+{
+    "testdata_path": "example_input/qa.jsonl",
+    "results_dir": "example_results/baseline_2",
+    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "has_citation", "answer_length", "latency"],
+    "target_url": "http://host.docker.internal:54846/chat",
+    "target_parameters": {
+        "overrides": {
+            "semantic_ranker": false
+        }
+    }
+}
@@ -1,10 +1,12 @@
 {
-    "overrides": {
-        "retrieval_mode": "hybrid",
-        "semantic_ranker": false,
-        "semantic_captions": false,
-        "top": 3,
-        "suggest_followup_questions": false
+    "evaluation_gpt_model": "gpt-4",
+    "evaluation_timestamp": 1708043241,
+    "testdata_path": "/workspaces/ai-rag-chat-evaluator/example_input/qa.jsonl",
+    "target_url": "http://host.docker.internal:54846/chat",
+    "target_parameters": {
+        "overrides": {
+            "semantic_ranker": false
+        }
     },
-    "evaluation_gpt_model": "gpt-4"
+    "num_questions": 200
 }
@@ -1,32 +1,31 @@
 {
     "gpt_groundedness": {
-        "mean_rating": 4.89,
-        "pass_count": 192,
-        "pass_rate": 0.96
+        "mean_rating": 4.96,
+        "pass_count": 197,
+        "pass_rate": 0.98
     },
     "gpt_relevance": {
-        "mean_rating": 4.76,
-        "pass_count": 188,
-        "pass_rate": 0.94
+        "mean_rating": 4.88,
+        "pass_count": 195,
+        "pass_rate": 0.97
     },
     "gpt_coherence": {
-        "mean_rating": 4.8,
-        "pass_count": 192,
-        "pass_rate": 0.96
+        "mean_rating": 4.85,
+        "pass_count": 194,
+        "pass_rate": 0.97
     },
-    "gpt_similarity": {
-        "mean_rating": 3.45,
-        "pass_count": 124,
-        "pass_rate": 0.62
+    "has_citation": {
+        "total": 198,
+        "rate": 0.99
     },
     "answer_length": {
-        "total": 131811,
-        "mean": 659.05,
-        "max": 2299,
-        "min": 90
+        "mean": 642.76,
+        "max": 2120,
+        "min": 96
     },
-    "answer_has_citation": {
-        "total": 197,
-        "rate": 0.98
+    "latency": {
+        "mean": 2.68,
+        "max": 5.640083,
+        "min": 1.563895
     }
 }
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"testdata_path": "example_input/qa.jsonl",`
`3`	`3`	`"results_dir": "example_results/experiment<TIMESTAMP>",`
`4`		`- "requested_metrics": ["relevance", "coherence", "groundedness", "latency", "answer_length"],`
	`4`	`+ "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"],`
`5`	`5`	`"target_url": "http://localhost:50505/chat",`
`6`	`6`	`"target_parameters": {`
`7`	`7`	`"overrides": {`