-
Notifications
You must be signed in to change notification settings - Fork 42
Add gulf genkit validator #314
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
/gemini summarize |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code Review
This pull request introduces a new spike package, `gulf_genkit_eval`, to evaluate various large language models for generating UI components from a schema. The implementation includes a Genkit flow, JSON schemas for UI components, a set of prompts and models for testing, and a custom validator to check the generated output. The overall structure is well-organized for an experimental package. My review includes several suggestions to improve correctness, maintainability, and robustness. I've identified a critical issue with a stale compiled file being included in the commit, which should be addressed. Other feedback points to improving environment variable handling, fixing an error in the README, making the validator more robust, and enhancing type safety and consistency. Also, please note that the repository's style guide mentions a Pre-Review Checklist from the PR template, which seems to be missing from this pull request's description.[^1]
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || (function () { | ||
var ownKeys = function(o) { | ||
ownKeys = Object.getOwnPropertyNames || function (o) { | ||
var ar = []; | ||
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; | ||
return ar; | ||
}; | ||
return ownKeys(o); | ||
}; | ||
return function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
})(); | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.componentGeneratorFlow = void 0; | ||
const google_genai_1 = require("@genkit-ai/google-genai"); | ||
const genkit_1 = require("genkit"); | ||
const fs = __importStar(require("fs")); | ||
const path = __importStar(require("path")); | ||
const openai_1 = require("@genkit-ai/compat-oai/openai"); | ||
const genkitx_anthropic_1 = require("genkitx-anthropic"); | ||
// Read the schema file | ||
const schemaString = fs.readFileSync(path.join(__dirname, 'schema.json'), 'utf-8'); | ||
const schema = JSON.parse(schemaString); | ||
const ai = (0, genkit_1.genkit)({ | ||
plugins: [(0, google_genai_1.googleAI)({ apiKey: process.env.GEMINI_API_KEY }), (0, openai_1.openAI)(), (0, genkitx_anthropic_1.anthropic)({ apiKey: process.env.ANTHROPIC_API_KEY }),], | ||
}); | ||
// Define a UI component generator flow | ||
exports.componentGeneratorFlow = ai.defineFlow({ | ||
name: 'componentGeneratorFlow', | ||
inputSchema: genkit_1.z.object({ prompt: genkit_1.z.string(), model: genkit_1.z.any() }), | ||
outputSchema: genkit_1.z.any(), | ||
}, async ({ prompt, model }) => { | ||
// Generate structured component data using the schema from the file | ||
const { output } = await ai.generate({ | ||
prompt, | ||
model, | ||
output: { jsonSchema: schema }, | ||
// config: { | ||
// thinkingConfig: { thinkingBudget: 0 } | ||
// }, | ||
}); | ||
if (!output) | ||
throw new Error('Failed to generate component'); | ||
return output; | ||
}); | ||
// Run the flow | ||
async function main() { | ||
const models = [ | ||
openai_1.openAI.model('gpt-5-mini'), | ||
openai_1.openAI.model('gpt-5'), | ||
openai_1.openAI.model('gpt-5-nano'), | ||
google_genai_1.googleAI.model('gemini-2.5-flash'), | ||
google_genai_1.googleAI.model('gemini-2.5-flash-lite'), | ||
genkitx_anthropic_1.claude4Sonnet, | ||
genkitx_anthropic_1.claude35Haiku, | ||
]; | ||
const prompt = `Generate a JSON conforming to the schema to describe the following UI: | ||
|
||
A root node has already been created with ID "root". You need to create a ComponentUpdate message now. | ||
|
||
A vertical list with: | ||
Dog breed information | ||
Dog generator | ||
|
||
The dog breed information is a card, which contains a title “Famous Dog breeds”, a header image, and a carousel of different dog breeds. The carousel information should be in the data model at /carousel. | ||
|
||
The dog generator is another card which is a form that generates a fictional dog breed with a description | ||
- Title | ||
- Description text explaining what it is | ||
- Dog breed name (text input) | ||
- Number of legs (number input) | ||
- Skills (checkboxes) | ||
- Button called “Generate” which takes the data above and generates a new dog description | ||
- A divider | ||
- A section which shows the generated content | ||
`; | ||
for (const model of models) { | ||
console.log(`Generating component with model: ${model.name}`); | ||
const component = await (0, exports.componentGeneratorFlow)({ | ||
prompt, | ||
model, | ||
}); | ||
console.log(JSON.stringify(component, null, 2)); | ||
} | ||
} | ||
main().catch(console.error); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The compiled JavaScript file `lib/index.js` is out of sync with its TypeScript source `src/index.ts`. For example, `lib/index.js` reads a single `schema.json` file, while `src/index.ts` dynamically reads different schema files based on the prompt. This will lead to incorrect behavior. Compiled files should generally not be committed to version control. They should be generated as part of a build process. Please remove `lib/index.js` from the repository and add `lib/` to your `.gitignore` file.
import { validateSchema } from './validator'; | ||
|
||
const ai = genkit({ | ||
plugins: [googleAI({ apiKey: process.env.GEMINI_API_KEY! }), openAI(), anthropic({ apiKey: process.env.ANTHROPIC_API_KEY }), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code uses a non-null assertion (`!`) for `process.env.GEMINI_API_KEY` and implicitly assumes `process.env.ANTHROPIC_API_KEY` and `process.env.OPENAI_API_KEY` are present. This can lead to runtime errors if these environment variables are not set. It's safer to validate all required environment variables at application startup and exit gracefully with a clear error message if any are missing.
export const componentGeneratorFlow = ai.defineFlow( | ||
{ | ||
name: 'componentGeneratorFlow', | ||
inputSchema: z.object({ prompt: z.string(), model: z.any(), config: z.any().optional(), schema: z.any() }), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `inputSchema` for `componentGeneratorFlow` uses `z.any()` for `model`, `config`, and `schema`. While convenient for a spike, this sacrifices type safety. Consider defining more specific Zod schemas for these inputs to improve code robustness and clarity. For example, `schema` could be validated as `z.record(z.any())` or a more detailed schema definition that matches the structure of your JSON schemas.
To run the test with the `gpt-5-nano (reasoning: minimal)` model and the `generateDogUIs` prompt, use the following command: | ||
|
||
```bash | ||
npx tsx src/index.ts --model='gpt-5-nano (reasoning: minimal)' --prompt=generateDogUIs |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The example command in the README uses `--prompt=generateDogUIs`, but the actual prompt name defined in `src/prompts.ts` is `dogBreedGenerator`. This will cause the command to fail. Please update the README to use the correct prompt name.
npx tsx src/index.ts --model='gpt-5-nano (reasoning: minimal)' --prompt=generateDogUIs | |
npx tsx src/index.ts --model='gpt-5-nano (reasoning: minimal)' --prompt=dogBreedGenerator |
"min_value": { | ||
"type": "number", | ||
"default": 0, | ||
"description": "For 'Slider' type: The minimum value." | ||
}, | ||
"max_value": { | ||
"type": "number", | ||
"default": 100, | ||
"description": "For 'Slider' type: The maximum value." | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The properties `min_value` and `max_value` for the `Slider` component use snake_case. This is inconsistent with the rest of the schema, which predominantly uses camelCase for property names (e.g., `tabItems`, `entryPointChild`). For consistency, please consider renaming them to `minValue` and `maxValue`.
"min_value": { | |
"type": "number", | |
"default": 0, | |
"description": "For 'Slider' type: The minimum value." | |
}, | |
"max_value": { | |
"type": "number", | |
"default": 100, | |
"description": "For 'Slider' type: The maximum value." | |
} | |
"minValue": { | |
"type": "number", | |
"default": 0, | |
"description": "For 'Slider' type: The minimum value." | |
}, | |
"maxValue": { | |
"type": "number", | |
"default": 100, | |
"description": "For 'Slider' type: The maximum value." | |
} |
break; | ||
case 'Divider': | ||
break; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `switch` statement in `validateComponent` for `component.type` does not have a `default` case. If a component with an unknown type is encountered, it will be silently ignored by the validator. It would be better to add a `default` case to log an error for unexpected component types.
default:
errors.push(`Unknown component type '${component.type}' for component ID '${component.id}'.`);
}
Summary of Changes
This pull request introduces a new Genkit evaluation project aimed at assessing the proficiency of various large language models in generating structured UI components according to a predefined JSON schema. It establishes a comprehensive framework for testing LLM outputs against both standard JSON schema validation and custom, more intricate validation rules, thereby ensuring the generated UI structures adhere to complex protocol constraints. The project facilitates a systematic comparison of different LLMs' capabilities in this domain.
Highlights
Changelog
Activity
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Summary of Changes
This pull request introduces a new Genkit evaluation project aimed at assessing the proficiency of various large language models in generating structured UI components according to a predefined JSON schema. It establishes a comprehensive framework for testing LLM outputs against both standard JSON schema validation and custom, more intricate validation rules, thereby ensuring the generated UI structures adhere to complex protocol constraints. The project facilitates a systematic comparison of different LLMs' capabilities in this domain.
Highlights
packages/spikes/gulf_genkit_eval
has been added, containing a Genkit-based project for evaluating the structured output capabilities of various large language models (LLMs).StreamHeader
,ComponentUpdate
,DataModelUpdate
, andBeginRendering
messages, defining a robust protocol for streaming UI components.src/validator.ts
) has been implemented to enforce complex validation rules on generated UI components, ensuring uniqueness of IDs, reference integrity, and component-specific property requirements that cannot be easily expressed in standard JSON schema.src/index.ts
) sets up a flow to test different LLMs (OpenAI, Google AI, Anthropic) against a set of predefined prompts and schemas, providing a structured way to compare their performance in generating valid UI structures.README.md
for execution instructions,GEMINI.md
detailing validation logic, and standard Genkit/TypeScript configuration files (genkit.conf.js
,package.json
,tsconfig.json
,.gitignore
).Changelog
componentGeneratorFlow
, loads schemas, and orchestrates the testing process across multiple LLMs and prompts, incorporating custom validation logic.ModelConfiguration
interface and an arraymodelsToTest
, listing various LLM models from OpenAI, Google AI, and Anthropic with their respective configurations for evaluation.TestPrompt
interface and an arrayprompts
, containing various prompts for UI component generation scenarios and their associated schemas.validateSchema
function and several helper functions to perform custom validation rules on generated UI components, checking for uniqueness, reference integrity, and component-specific property adherence.Activity
lib/index.js
file being out of sync with its TypeScript source and suggested removing it and addinglib/
to.gitignore
.z.any()
being used formodel
,config
, andschema
incomponentGeneratorFlow
'sinputSchema
, suggesting more specific Zod schemas for better type safety.README.md
example command, where the prompt namegenerateDogUIs
was incorrect and should bedogBreedGenerator
.min_value
andmax_value
tominValue
andmaxValue
in theSlider
component schema.validateComponent
function, advising the addition of adefault
case to itsswitch
statement to catch unknown component types.