97 changes: 77 additions & 20 deletions src/components/AdvisorPage.js
@@ -57,6 +57,7 @@ const AdvisorPage = () => {
const [selectedModels, setSelectedModels] = useState([]);
const [userInput, setUserInput] = useState("");
const [testPrompt, setTestPrompt] = useState("");
const [userBenchmark, setUserBenchmark] = useState("");
const [promptLoading, setPromptLoading] = useState(false);
const [results, setResults] = useState([]);
const [loading, setLoading] = useState(false);
@@ -100,6 +101,11 @@ const AdvisorPage = () => {
setApiError(""); // Clear previous errors
};

// Handle user benchmark changes
const handleUserBenchmarkChange = (e) => {
setUserBenchmark(e.target.value);
};

// Handle generating a test prompt using Maestro
const handleGeneratePrompt = async () => {
if (!userInput.trim()) {
@@ -151,10 +157,30 @@ const AdvisorPage = () => {
setEvaluationResults(null);
setApiError("");

const modelResults = [];
// Initialize results array, potentially including user benchmark
let modelResults = [];

// If user provided a benchmark answer, include it in the results
if (userBenchmark.trim()) {
modelResults.push({
model: {
id: 'benchmark',
name: "User Benchmark",
provider: "User",
version: "N/A",
modelId: "user-benchmark"
},
output: userBenchmark,
metrics: {
tokensUsed: 0,
responseTime: 0,
cost: 0,
},
});
}

// Test each selected model
for (const model of selectedModels) {
// Define the async function to call a single model
const callModel = async (model) => {
try {
console.log(`Testing model: ${model.name}`);

@@ -192,37 +218,51 @@ const AdvisorPage = () => {
cost: data.usage.cost,
};

modelResults.push({
return {
model,
output,
metrics,
});
};
} catch (error) {
console.error(`Error testing model ${model.name}:`, error);
modelResults.push({
setApiError(prevError => {
if (prevError) {
return `${prevError}; ${error.message}`;
}
return error.message;
});

return {
model,
output: `Error: ${error.message}`,
metrics: {
tokensUsed: 0,
responseTime: 0,
cost: 0,
},
});
setApiError(prevError => {
if (prevError) {
return `${prevError}; ${error.message}`;
}
return error.message;
});
};
}
}
};

setResults(modelResults);
setLoading(false);

// If we have results, evaluate them using Maestro
if (modelResults.length > 0 && modelResults.some(r => !r.output.startsWith('Error'))) {
await handleEvaluateResults(testPrompt, modelResults);
try {
// Call all models in parallel and wait for all results
const modelPromises = selectedModels.map(model => callModel(model));
const apiResults = await Promise.all(modelPromises);

// Combine benchmark (if any) with API results
modelResults = [...modelResults, ...apiResults];

setResults(modelResults);

// If we have results, evaluate them using Maestro
if (modelResults.length > 0 && modelResults.some(r => !r.output.startsWith('Error'))) {
await handleEvaluateResults(testPrompt, modelResults);
}
} catch (error) {
console.error("Error processing model results:", error);
setApiError(`Unexpected error: ${error.message}`);
} finally {
setLoading(false);
}
};
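
The loop-to-`Promise.all` change above is the core of this diff: the sequential `for (const model of selectedModels)` loop becomes a per-model `callModel` function whose calls run concurrently, and each call catches its own error so a failing model only contributes an error entry instead of rejecting the whole batch. A minimal standalone sketch of that pattern, with `fetchModelOutput` as a hypothetical stand-in for the real per-model API call:

```js
// Minimal standalone sketch (not part of the PR) of the concurrent-call pattern:
// each model gets its own async call, each call catches its own error, and
// Promise.all therefore never rejects because of a single failing model.
async function runModelsConcurrently(selectedModels, testPrompt, fetchModelOutput) {
  const callModel = async (model) => {
    try {
      // `fetchModelOutput` is a hypothetical stand-in; assume it resolves to { text, metrics }.
      const { text, metrics } = await fetchModelOutput(model, testPrompt);
      return { model, output: text, metrics };
    } catch (error) {
      // Failure is recorded as a result entry instead of being re-thrown.
      return {
        model,
        output: `Error: ${error.message}`,
        metrics: { tokensUsed: 0, responseTime: 0, cost: 0 },
      };
    }
  };

  // All calls start immediately; results come back in the same order as selectedModels.
  return Promise.all(selectedModels.map(callModel));
}
```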

@@ -309,6 +349,23 @@ const AdvisorPage = () => {
onChange={(e) => setTestPrompt(e.target.value)}
></textarea>
</div>

{/* Add user benchmark input */}
<div className="mb-4">
<label className="block text-gray-700 mb-2">
Your Benchmark Answer (Optional):
</label>
<textarea
className="w-full px-3 py-2 border rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500"
rows="4"
value={userBenchmark}
onChange={handleUserBenchmarkChange}
placeholder="Enter your own answer to serve as a benchmark for comparison..."
></textarea>
<p className="mt-1 text-xs text-gray-500">
Your answer will be included in the evaluation and compared against the AI models' responses
</p>
</div>
</div>
)}
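
For reference, the synthetic result entry that `handleTestModels` prepends when this textarea is filled gives the user's answer the same shape as a real model result, with zeroed usage metrics and the sentinel id `'benchmark'` that the evaluator keys on. A small illustrative helper showing that shape in isolation (the name `makeBenchmarkEntry` is not in the PR):

```js
// Hypothetical helper (not in the PR) that builds the same synthetic result entry
// as the block near the top of handleTestModels; returns null when the field is empty.
const makeBenchmarkEntry = (userBenchmark) => {
  if (!userBenchmark.trim()) return null;
  return {
    model: {
      id: "benchmark",        // sentinel id that evaluateModelOutputs checks for
      name: "User Benchmark",
      provider: "User",
      version: "N/A",
      modelId: "user-benchmark",
    },
    output: userBenchmark,
    // A human answer consumes no API usage, so all metrics stay at zero.
    metrics: { tokensUsed: 0, responseTime: 0, cost: 0 },
  };
};
```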

17 changes: 16 additions & 1 deletion src/utils/maestroAPI.js
@@ -124,6 +124,9 @@ export const evaluateModelOutputs = async (prompt, modelOutputs) => {
return mockEvaluateModelOutputs(prompt, modelOutputs);
}

// Check if there's a user benchmark
const hasBenchmark = modelOutputs.some(item => item.model.id === 'benchmark');

// Format the model outputs for evaluation
const formattedOutputs = modelOutputs.map(item => ({
name: item.model.name,
@@ -146,11 +149,13 @@ export const evaluateModelOutputs = async (prompt, modelOutputs) => {
{
role: "system",
content: `You are an expert at evaluating AI model outputs. You will be given a prompt and multiple AI responses to that prompt from different models.
${hasBenchmark ? "One of the responses is labeled as 'User Benchmark' - this is the user's own answer and should be used as a reference point for evaluating the other AI responses." : ""}
Analyze each response for accuracy, clarity, creativity, and usefulness. Provide a detailed comparison in markdown format that includes:

1. Summary of strengths and weaknesses for each model
2. Comparative analysis across all models
3. Recommendation on which model performed best and why`
3. Recommendation on which model performed best and why
${hasBenchmark ? "4. How the AI models compare to the user's benchmark answer" : ""}`
},
{
role: "user",
@@ -216,6 +221,7 @@ export const mockMaestroPromptGeneration = async (userInput) => {
const mockEvaluateModelOutputs = async (prompt, modelOutputs) => {
// Create a basic evaluation
const modelNames = modelOutputs.map(item => item.model.name).join(', ');
const hasBenchmark = modelOutputs.some(item => item.model.id === 'benchmark');

return `# Model Evaluation

@@ -243,6 +249,15 @@ The models show different approaches to the same prompt, with some providing mor

Based on the overall quality, completeness, and usefulness of the responses, ${modelOutputs[0]?.model.name || 'the first model'} appears to provide the most comprehensive and useful response to the given prompt.

${hasBenchmark ? `
## Comparison to User Benchmark

The User Benchmark provides a human perspective that differs from the AI models in the following ways:
- May show different reasoning approaches
- Often more concise or focused on practical aspects
- Provides a useful reference point for evaluating AI responses
` : ''}

*Note: This is an auto-generated evaluation as a fallback when the AI evaluation service is unavailable.*
`;
};
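
On the maestroAPI.js side, the benchmark is detected via that sentinel id, and the evaluation system prompt gains an extra instruction plus a fourth numbered item. A minimal sketch of that prompt composition, assuming the same `modelOutputs` shape as above (`buildEvaluationSystemPrompt` is an illustrative name, not a function in the PR):

```js
// Illustrative sketch (not in the PR) of how the benchmark-aware system prompt
// in evaluateModelOutputs is assembled; the wording mirrors the template above.
const buildEvaluationSystemPrompt = (modelOutputs) => {
  // The synthetic entry added in AdvisorPage.js carries id 'benchmark'.
  const hasBenchmark = modelOutputs.some((item) => item.model.id === "benchmark");

  const lines = [
    "You are an expert at evaluating AI model outputs. You will be given a prompt and multiple AI responses to that prompt from different models.",
  ];
  if (hasBenchmark) {
    lines.push(
      "One of the responses is labeled as 'User Benchmark' - this is the user's own answer and should be used as a reference point for evaluating the other AI responses."
    );
  }
  lines.push(
    "Analyze each response for accuracy, clarity, creativity, and usefulness. Provide a detailed comparison in markdown format that includes:",
    "1. Summary of strengths and weaknesses for each model",
    "2. Comparative analysis across all models",
    "3. Recommendation on which model performed best and why"
  );
  if (hasBenchmark) {
    lines.push("4. How the AI models compare to the user's benchmark answer");
  }
  return lines.join("\n");
};
```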