97 changes: 77 additions & 20 deletions src/components/AdvisorPage.js
@@ -57,6 +57,7 @@ const AdvisorPage = () => {
const [selectedModels, setSelectedModels] = useState([]);
const [userInput, setUserInput] = useState("");
const [testPrompt, setTestPrompt] = useState("");
const [userBenchmark, setUserBenchmark] = useState("");
const [promptLoading, setPromptLoading] = useState(false);
const [results, setResults] = useState([]);
const [loading, setLoading] = useState(false);
@@ -100,6 +101,11 @@ const AdvisorPage = () => {
setApiError(""); // Clear previous errors
};

// Handle user benchmark changes
const handleUserBenchmarkChange = (e) => {
setUserBenchmark(e.target.value);
};

// Handle generating a test prompt using Maestro
const handleGeneratePrompt = async () => {
if (!userInput.trim()) {
@@ -151,10 +157,30 @@ const AdvisorPage = () => {
setEvaluationResults(null);
setApiError("");

const modelResults = [];
// Initialize results array, potentially including user benchmark
let modelResults = [];

// If user provided a benchmark answer, include it in the results
if (userBenchmark.trim()) {
modelResults.push({
model: {
id: 'benchmark',
name: "User Benchmark",
provider: "User",
version: "N/A",
modelId: "user-benchmark"
},
output: userBenchmark,
metrics: {
tokensUsed: 0,
responseTime: 0,
cost: 0,
},
});
}

// Test each selected model
for (const model of selectedModels) {
// Define the async function to call a single model
const callModel = async (model) => {
try {
console.log(`Testing model: ${model.name}`);

@@ -192,37 +218,51 @@ const AdvisorPage = () => {
cost: data.usage.cost,
};

modelResults.push({
return {
model,
output,
metrics,
});
};
} catch (error) {
console.error(`Error testing model ${model.name}:`, error);
modelResults.push({
setApiError(prevError => {
if (prevError) {
return `${prevError}; ${error.message}`;
}
return error.message;
});

return {
model,
output: `Error: ${error.message}`,
metrics: {
tokensUsed: 0,
responseTime: 0,
cost: 0,
},
});
setApiError(prevError => {
if (prevError) {
return `${prevError}; ${error.message}`;
}
return error.message;
});
};
}
}
};

setResults(modelResults);
setLoading(false);

// If we have results, evaluate them using Maestro
if (modelResults.length > 0 && modelResults.some(r => !r.output.startsWith('Error'))) {
await handleEvaluateResults(testPrompt, modelResults);
try {
// Call all models in parallel and wait for all results
const modelPromises = selectedModels.map(model => callModel(model));
const apiResults = await Promise.all(modelPromises);

// Combine benchmark (if any) with API results
modelResults = [...modelResults, ...apiResults];

setResults(modelResults);

// If we have results, evaluate them using Maestro
if (modelResults.length > 0 && modelResults.some(r => !r.output.startsWith('Error'))) {
await handleEvaluateResults(testPrompt, modelResults);
}
} catch (error) {
console.error("Error processing model results:", error);
setApiError(`Unexpected error: ${error.message}`);
} finally {
setLoading(false);
}
};
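
The loop-to-`Promise.all` change above is the core of this diff: the sequential `for (const model of selectedModels)` loop becomes a per-model `callModel` function whose calls run concurrently, and each call catches its own error so a failing model only contributes an error entry instead of rejecting the whole batch. A minimal standalone sketch of that pattern, with `fetchModelOutput` as a hypothetical stand-in for the real per-model API call:

```js
// Minimal standalone sketch (not part of the PR) of the concurrent-call pattern:
// each model gets its own async call, each call catches its own error, and
// Promise.all therefore never rejects because of a single failing model.
async function runModelsConcurrently(selectedModels, testPrompt, fetchModelOutput) {
  const callModel = async (model) => {
    try {
      // `fetchModelOutput` is a hypothetical stand-in; assume it resolves to { text, metrics }.
      const { text, metrics } = await fetchModelOutput(model, testPrompt);
      return { model, output: text, metrics };
    } catch (error) {
      // Failure is recorded as a result entry instead of being re-thrown.
      return {
        model,
        output: `Error: ${error.message}`,
        metrics: { tokensUsed: 0, responseTime: 0, cost: 0 },
      };
    }
  };

  // All calls start immediately; results come back in the same order as selectedModels.
  return Promise.all(selectedModels.map(callModel));
}
```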

@@ -309,6 +349,23 @@ const AdvisorPage = () => {
onChange={(e) => setTestPrompt(e.target.value)}
></textarea>
</div>

{/* Add user benchmark input */}
<div className="mb-4">
<label className="block text-gray-700 mb-2">
Your Benchmark Answer (Optional):
</label>
<textarea
className="w-full px-3 py-2 border rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500"
rows="4"
value={userBenchmark}
onChange={handleUserBenchmarkChange}
placeholder="Enter your own answer to serve as a benchmark for comparison..."
></textarea>
<p className="mt-1 text-xs text-gray-500">
Your answer will be included in the evaluation and compared against the AI models' responses
</p>
</div>
</div>
)}
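
For reference, the synthetic result entry that `handleTestModels` prepends when this textarea is filled gives the user's answer the same shape as a real model result, with zeroed usage metrics and the sentinel id `'benchmark'` that the evaluator keys on. A small illustrative helper showing that shape in isolation (the name `makeBenchmarkEntry` is not in the PR):

```js
// Hypothetical helper (not in the PR) that builds the same synthetic result entry
// as the block near the top of handleTestModels; returns null when the field is empty.
const makeBenchmarkEntry = (userBenchmark) => {
  if (!userBenchmark.trim()) return null;
  return {
    model: {
      id: "benchmark",        // sentinel id that evaluateModelOutputs checks for
      name: "User Benchmark",
      provider: "User",
      version: "N/A",
      modelId: "user-benchmark",
    },
    output: userBenchmark,
    // A human answer consumes no API usage, so all metrics stay at zero.
    metrics: { tokensUsed: 0, responseTime: 0, cost: 0 },
  };
};
```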

17 changes: 16 additions & 1 deletion src/utils/maestroAPI.js
@@ -124,6 +124,9 @@ export const evaluateModelOutputs = async (prompt, modelOutputs) => {
return mockEvaluateModelOutputs(prompt, modelOutputs);
}

// Check if there's a user benchmark
const hasBenchmark = modelOutputs.some(item => item.model.id === 'benchmark');

// Format the model outputs for evaluation
const formattedOutputs = modelOutputs.map(item => ({
name: item.model.name,
@@ -146,11 +149,13 @@ export const evaluateModelOutputs = async (prompt, modelOutputs) => {
{
role: "system",
content: `You are an expert at evaluating AI model outputs. You will be given a prompt and multiple AI responses to that prompt from different models.
${hasBenchmark ? "One of the responses is labeled as 'User Benchmark' - this is the user's own answer and should be used as a reference point for evaluating the other AI responses." : ""}
Analyze each response for accuracy, clarity, creativity, and usefulness. Provide a detailed comparison in markdown format that includes:

1. Summary of strengths and weaknesses for each model
2. Comparative analysis across all models
3. Recommendation on which model performed best and why`
3. Recommendation on which model performed best and why
${hasBenchmark ? "4. How the AI models compare to the user's benchmark answer" : ""}`
},
{
role: "user",
@@ -216,6 +221,7 @@ export const mockMaestroPromptGeneration = async (userInput) => {
const mockEvaluateModelOutputs = async (prompt, modelOutputs) => {
// Create a basic evaluation
const modelNames = modelOutputs.map(item => item.model.name).join(', ');
const hasBenchmark = modelOutputs.some(item => item.model.id === 'benchmark');

return `# Model Evaluation

@@ -243,6 +249,15 @@ The models show different approaches to the same prompt, with some providing mor

Based on the overall quality, completeness, and usefulness of the responses, ${modelOutputs[0]?.model.name || 'the first model'} appears to provide the most comprehensive and useful response to the given prompt.

${hasBenchmark ? `
## Comparison to User Benchmark

The User Benchmark provides a human perspective that differs from the AI models in the following ways:
- May show different reasoning approaches
- Often more concise or focused on practical aspects
- Provides a useful reference point for evaluating AI responses
` : ''}

*Note: This is an auto-generated evaluation as a fallback when the AI evaluation service is unavailable.*
`;
};
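
On the maestroAPI.js side, the benchmark is detected via that sentinel id, and the evaluation system prompt gains an extra instruction plus a fourth numbered item. A minimal sketch of that prompt composition, assuming the same `modelOutputs` shape as above (`buildEvaluationSystemPrompt` is an illustrative name, not a function in the PR):

```js
// Illustrative sketch (not in the PR) of how the benchmark-aware system prompt
// in evaluateModelOutputs is assembled; the wording mirrors the template above.
const buildEvaluationSystemPrompt = (modelOutputs) => {
  // The synthetic entry added in AdvisorPage.js carries id 'benchmark'.
  const hasBenchmark = modelOutputs.some((item) => item.model.id === "benchmark");

  const lines = [
    "You are an expert at evaluating AI model outputs. You will be given a prompt and multiple AI responses to that prompt from different models.",
  ];
  if (hasBenchmark) {
    lines.push(
      "One of the responses is labeled as 'User Benchmark' - this is the user's own answer and should be used as a reference point for evaluating the other AI responses."
    );
  }
  lines.push(
    "Analyze each response for accuracy, clarity, creativity, and usefulness. Provide a detailed comparison in markdown format that includes:",
    "1. Summary of strengths and weaknesses for each model",
    "2. Comparative analysis across all models",
    "3. Recommendation on which model performed best and why"
  );
  if (hasBenchmark) {
    lines.push("4. How the AI models compare to the user's benchmark answer");
  }
  return lines.join("\n");
};
```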