
Commit 5b59fdb

crisbeto, devversion and AndrewKushnir committed

feat: add web-codegen-scorer

Adds the initial implementation of the Web Codegen Scorer.

Co-authored-by: Paul Gschwendtner <[email protected]>
Co-authored-by: Andrew Kushnir <[email protected]>

1 parent 51f89df · commit 5b59fdb

File tree: 236 files changed (+50425, -0 lines)


.github/workflows/build.yml

Lines changed: 21 additions & 0 deletions
```yaml
on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - uses: actions/setup-node@d7a11313b581b306c961b506cfc8971208bb03f6
        with:
          node-version: 24
      - uses: pnpm/action-setup@f2b2b233b538f500472c7274c7012f57857d8ce0
        with:
          version: 9
      - run: pnpm i --frozen-lockfile
      - run: pnpm check-format
      - run: pnpm release-build
```

.gitignore

Lines changed: 15 additions & 0 deletions
```
dist
.tmp/
.report-migration-backups
.DS_Store
.vscode
safety-web.log
node_modules

report-app/node_modules
report-app/dist
report-app/.angular
report-app/.vscode
report-app/.reports

.web-codegen-scorer
```

.prettierrc.json

Lines changed: 15 additions & 0 deletions
```json
{
  "semi": true,
  "trailingComma": "es5",
  "singleQuote": true,
  "printWidth": 80,
  "tabWidth": 2,
  "overrides": [
    {
      "files": "*.html",
      "options": {
        "parser": "angular"
      }
    }
  ]
}
```

README.md

Lines changed: 93 additions & 0 deletions
# Web Codegen Scorer

This project is a tool designed to assess the quality of front-end code generated by Large Language Models (LLMs).

## Documentation directory

- [Environment config reference](./docs/environment-reference.md)
- [How to set up a new model?](./docs/model-setup.md)

## Setup

1. **Install the package:**

   ```bash
   npm install -g web-codegen-scorer
   ```

2. **Set up your API keys:**
   In order to run an eval, you have to specify API keys for the relevant providers as environment variables:

   ```bash
   export GEMINI_API_KEY="YOUR_API_KEY_HERE" # If you're using Gemini models
   export OPENAI_API_KEY="YOUR_API_KEY_HERE" # If you're using OpenAI models
   export ANTHROPIC_API_KEY="YOUR_API_KEY_HERE" # If you're using Anthropic models
   ```

3. **Run an eval:**
   You can run your first eval using our Angular example with the following command:

   ```bash
   web-codegen-scorer eval --env=angular-example
   ```

4. (Optional) **Set up your own eval:**
   If you want to set up a custom eval, instead of using our built-in examples, you can run the following
   command, which will guide you through the process:

   ```bash
   web-codegen-scorer init
   ```

## Command-line flags

You can customize the `web-codegen-scorer eval` script with the following flags:

- `--env=<path>` (alias: `--environment`): (**Required**) Specifies the path from which to load the environment config.
  - Example: `web-codegen-scorer eval --env=foo/bar/my-env.js`

- `--model=<name>`: Specifies the model to use when generating code. Defaults to the value of `DEFAULT_MODEL_NAME`.
  - Example: `web-codegen-scorer eval --model=gemini-2.5-flash --env=<config path>`

- `--runner=<name>`: Specifies the runner to use to execute the eval. Supported runners are `genkit` (default) and `gemini-cli`.

- `--local`: Runs the script in local mode for the initial code generation request. Instead of calling the LLM, it will attempt to read the initial code from a corresponding file in the `.llm-output` directory (e.g., `.llm-output/todo-app.ts`). This is useful for re-running assessments or debugging the build/repair process without incurring LLM costs for the initial generation.
  - **Note:** You typically need to run `web-codegen-scorer eval` once without `--local` to generate the initial files in `.llm-output`.
  - The `web-codegen-scorer eval:local` script is a shortcut for `web-codegen-scorer eval --local`.

- `--limit=<number>`: Specifies the number of application prompts to process. Defaults to `5`.
  - Example: `web-codegen-scorer eval --limit=10 --env=<config path>`

- `--output-directory=<name>` (alias: `--output-dir`): Specifies the directory under which to output the generated code, which is useful for debugging. By default the code is generated in a temporary directory.
  - Example: `web-codegen-scorer eval --output-dir=test-output --env=<config path>`

- `--concurrency=<number>`: Sets the maximum number of concurrent AI API requests. Defaults to `5` (as defined by `DEFAULT_CONCURRENCY` in `src/config.ts`).
  - Example: `web-codegen-scorer eval --concurrency=3 --env=<config path>`

- `--report-name=<name>`: Sets the name for the generated report directory. Defaults to a timestamp (e.g., `2023-10-27T10-30-00-000Z`). The name will be sanitized (non-alphanumeric characters replaced with hyphens).
  - Example: `web-codegen-scorer eval --report-name=my-custom-report --env=<config path>`

- `--rag-endpoint=<url>`: Specifies a custom RAG (Retrieval-Augmented Generation) endpoint URL. The URL must contain a `PROMPT` substring, which will be replaced with the user prompt.
  - Example: `web-codegen-scorer eval --rag-endpoint="http://localhost:8080/my-rag-endpoint?query=PROMPT" --env=<config path>`

- `--prompt-filter=<name>`: String used to filter which prompts should be run. By default a random sample (controlled by `--limit`) is taken from the prompts in the current environment. Setting this can be useful for debugging a specific prompt.
  - Example: `web-codegen-scorer eval --prompt-filter=tic-tac-toe --env=<config path>`

- `--skip-screenshots`: Whether to skip taking screenshots of the generated app. Defaults to `false`.
  - Example: `web-codegen-scorer eval --skip-screenshots --env=<config path>`

- `--labels=<label1> <label2>`: Metadata labels that will be attached to the run.
  - Example: `web-codegen-scorer eval --labels my-label another-label --env=<config path>`

- `--mcp`: Whether to start an MCP server for the evaluation. Defaults to `false`.
  - Example: `web-codegen-scorer eval --mcp --env=<config path>`

- `--help`: Prints out usage information about the script.
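Several of these flags can be combined in a single invocation. The command below is only a sketch; the environment path, report name and labels are placeholders:

```bash
web-codegen-scorer eval \
  --env=foo/bar/my-env.js \
  --model=gemini-2.5-flash \
  --limit=10 \
  --concurrency=3 \
  --report-name=my-custom-report \
  --labels my-label another-label
```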
## Local development

If you've cloned this repo and want to work on the tool, you have to install its dependencies by running `pnpm install`.
Once they're installed, you can run the following commands:

* `pnpm run release-build` - Builds the package in the `dist` directory for publishing to npm.
* `pnpm run eval` - Runs an eval from source.
* `pnpm run report` - Runs the report app from source.
* `pnpm run init` - Runs the init script from source.
* `pnpm run format` - Formats the source code using Prettier.

docs/environment-reference.md

Lines changed: 183 additions & 0 deletions
# Environment configuration reference

Environments are configured by creating a `config.js` that exposes an object that satisfies the
`EnvironmentConfig` interface. This document covers all the possible options in `EnvironmentConfig`
and what they do.
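As a point of reference, a minimal `config.js` might look like the sketch below, adapted from the Angular example included in this commit (the paths are placeholders for your own files):

```js
import { getBuiltInRatings } from 'web-codegen-scorer';

/** @type {import("web-codegen-scorer").EnvironmentConfig} */
export default {
  displayName: 'My environment',
  clientSideFramework: 'angular',
  sourceDirectory: './project',
  ratings: getBuiltInRatings(),
  generationSystemPrompt: './system-instructions.md',
  executablePrompts: ['./prompts/**/*.md'],
  packageManager: 'npm',
};
```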
## Required properties

These properties all have to be specified in order for the environment to function.

### `displayName`

Human-readable name that will be shown in eval reports about this environment.

### `id`

Unique ID for the environment. If omitted, one will be generated from the `displayName`.

### `clientSideFramework`

ID of the client-side framework that the environment will be running, for example `angular`.

### `ratings`

An array defining the ratings that will be executed as a part of the evaluation.
The ratings determine the score that will be assigned to the test run.
Currently we support the following types of ratings:

- `PerBuildRating` - assigns a score based on the build result of the generated code, e.g.
  "Does it build on the first run?" or "Does it build after X repair attempts?"
- `PerFileRating` - assigns a score based on the content of individual files generated by the LLM.
  Can be run either against all file types by setting the `filter` to
  `PerFileRatingContentType.UNKNOWN` or against specific files.
- `LLMBasedRating` - rates the generated code by asking an LLM to assign a score to it,
  e.g. "Does this app match the specified prompts?"

### `packageManager`

Name of the package manager to use to install dependencies for the evaluated code.
Supports `npm`, `pnpm` and `yarn`. Defaults to `npm`.

### `generationSystemPrompt`

Relative path to the system instructions that should be passed to the LLM when generating code.

### `repairSystemPrompt`

Relative path to the system instructions that should be passed to the LLM when repairing failures.

### `executablePrompts`

Configures the prompts that should be evaluated against the environment. Can contain either strings
which represent glob patterns pointing to text files with the prompt's text
(e.g. `./prompts/**/*.md`) or `MultiStepPrompt` objects ([see below](#multi-step-prompts)).
The prompts can be shared between environments
(e.g. `executablePrompts: ['../some-other-env/prompts/**/*.md']`).

### `classifyPrompts`

When enabled, the system prompts for this environment won't be included in the final report.
This is useful when evaluating confidential code.

### `skipInstall`

Whether to skip installing dependencies during the eval run. This can be useful if you've already
ensured that all dependencies are installed through something like pnpm workspaces.

### Prompt templating

Prompts are typically stored in `.md` files. We support the following template syntax inside of
these files in order to augment the prompt and reduce boilerplate:

- `{{> embed file='../path/to/file.md' }}` - embeds the content of the specified file in the
  current one.
- `{{> contextFiles '**/*.foo' }}` - specifies files that should be passed to the LLM as context
  when the prompt is executed. Should be a comma-separated string of glob patterns **within** the
  environment's project code. E.g. `{{> contextFiles '**/*.ts, **/*.html' }}` will pass all `.ts`
  and `.html` files as context.
- `{{CLIENT_SIDE_FRAMEWORK_NAME}}` - inserts the name of the client-side framework of the current
  environment.
- `{{FULL_STACK_FRAMEWORK_NAME}}` - inserts the name of the full-stack framework of the current
  environment.
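For example, a single prompt file could combine these directives as follows. This is only an illustrative sketch; the embedded file path and the prompt text are placeholders:

```
{{> embed file='../shared/requirements.md' }}
{{> contextFiles '**/*.ts, **/*.html' }}

Build a small to-do application using {{CLIENT_SIDE_FRAMEWORK_NAME}}.
```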
### Prompt-specific ratings
85+
86+
If you want to run a set of ratings against a specific prompt, you can set an object literal
87+
in the `executablePrompts` array, instead of a string:
88+
89+
```ts
90+
executablePrompts: [
91+
// Runs only with the environment-level ratings.
92+
'./prompts/foo/*.md',
93+
94+
// Runs the ratings specific to the `contact-form.md`, as well as the environment-level ones.
95+
{
96+
path: './prompts/bar/contact-form.md',
97+
ratings: contactFormSpecificRatings,
98+
},
99+
];
100+
```
101+
102+
### Multi-step prompts
103+
104+
Multi-step prompts are prompts meant to evaluate workflows made up of one or more stages.
105+
Steps execute one after another **inside the same directory**, but are rated individually and
106+
snapshots after each step are stored in the final report. You can create a multi-step prompt by
107+
passing an instrance of the `MultiStepPrompt` class into the `executablePrompts` array, for example:
108+
109+
```ts
110+
executablePrompts: [
111+
new MultiStepPrompt('./prompts/about-page', {
112+
'step-1': ratingsForFirstStep,
113+
'step-2': [...ratingsForFirstStep, ratingsForSecondStep],
114+
}),
115+
];
116+
```
117+
118+
The first parameter is the directory from which to resolve the individual step prompts.
119+
All files in the directory **have to be named `step-{number}.md`**, for example:
120+
121+
**my-env/prompts/about-page/step-1.md:**
122+
123+
```
124+
Create an "About us" page.
125+
```
126+
127+
**my-env/prompts/about-page/step-2.md:**
128+
129+
```
130+
Add a contact form to the "About us" page
131+
```
132+
133+
**my-env/prompts/about-page/step-3.md:**
134+
135+
```
136+
Make it so submitting the contact form redirects the user back to the homepage.
137+
```
138+
139+
The second parameter of `MultiStepPrompt` defines ratings that should be run only against specific
140+
steps. The key is the name of the step (e.g. `step-2`) while the value are the ratings that should
141+
run against it.
142+
143+
## Optional properties
144+
145+
These properties aren't required for the environment to run, but can be used to configure it further.
146+
147+
### `sourceDirectory`
148+
149+
Project into which the LLM-generated files will be placed, built, executed and evaluated.
150+
Can be an entire project or a handful of files that will be merged with the
151+
`projectTemplate` ([see below](#projecttemplate))
152+
153+
### `projectTemplate`
154+
155+
Used for reducing the boilerplate when setting up an environment, `projectTemplate` specifies the
156+
path of the project template that will be merged together with the files from `sourceDirectory` to
157+
create the final project structure that the evaluation will run against.
158+
159+
For example, if the config has `projectTemplate: './templates/angular', sourceDirectory: './project'`,
160+
the eval runner will copy the files from `./templates/angular` into the output directory
161+
and then apply the files from `./project` on top of them, merging directories and replacing
162+
overlapping files.
163+
164+
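As a rough illustration of that merge (the file names below are hypothetical):

```
# Inputs:
templates/angular/package.json
templates/angular/src/main.ts
project/package.json            # overlaps with the template copy
project/src/app/app.ts

# Merged output directory:
package.json                    # taken from ./project (overrides the template copy)
src/main.ts                     # taken from ./templates/angular
src/app/app.ts                  # taken from ./project
```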
### `fullStackFramework`

Name of the full-stack framework that is used in the evaluation, in addition to the
`clientSideFramework`. If omitted, the `fullStackFramework` will be set to the same value as
the `clientSideFramework`.

### `mcpServers`

IDs of Model Context Protocol servers that will be started and exposed to the LLM as a part of
the evaluation.

### `buildCommand`

Command used to build the generated code as a part of the evaluation.
Defaults to `<package manager> run build`.

### `serveCommand`

Command used to start a local dev server as a part of the evaluation.
Defaults to `<package manager> run start --port 0`.
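As an illustration, an environment's `config.js` could override some of these optional properties. The command strings below are placeholders rather than recommended values:

```js
/** @type {import("web-codegen-scorer").EnvironmentConfig} */
export default {
  // ...required properties as described above...
  projectTemplate: './templates/angular',    // merged with `sourceDirectory`
  buildCommand: 'npm run build',             // placeholder override
  serveCommand: 'npm run start -- --port 0', // placeholder override
};
```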

docs/model-setup.md

Lines changed: 9 additions & 0 deletions
# How to set up a new LLM?

If you want to test out a model that isn't yet available in the runner, you can add
support for it by following these steps:

1. Ensure that the provider of the model is supported by Genkit.
2. Find the provider for the model in `runner/codegen/genkit/providers`. If the provider hasn't been implemented yet, do so by creating a new `GenkitModelProvider` and adding it to the `MODEL_PROVIDERS` in `runner/genkit/models.ts`.
3. Add your model to the `GenkitModelProvider` configs.
4. Done! 🎉 You can now run your model by passing `--model=<your model ID>`.
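For example, once the model is registered, you could evaluate it against the built-in Angular example as follows (the model ID below is a placeholder):

```bash
web-codegen-scorer eval --model=my-provider/my-new-model --env=angular-example
```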
Lines changed: 12 additions & 0 deletions
```js
import { getBuiltInRatings } from 'web-codegen-scorer';

/** @type {import("web-codegen-scorer").EnvironmentConfig} */
export default {
  displayName: 'Angular (example)',
  clientSideFramework: 'angular',
  sourceDirectory: './project',
  ratings: getBuiltInRatings(),
  generationSystemPrompt: './system-instructions.md',
  executablePrompts: ['../../prompts/**/*.md'],
  packageManager: 'npm',
};
```
Lines changed: 42 additions & 0 deletions
```
# See https://docs.github.com/get-started/getting-started-with-git/ignoring-files for more about ignoring files.

# Compiled output
/dist
/tmp
/out-tsc
/bazel-out

# Node
/node_modules
npm-debug.log
yarn-error.log

# IDEs and editors
.idea/
.project
.classpath
.c9/
*.launch
.settings/
*.sublime-workspace

# Visual Studio Code
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
.history/*

# Miscellaneous
/.angular/cache
.sass-cache/
/connect.lock
/coverage
/libpeerconnection.log
testem.log
/typings

# System files
.DS_Store
Thumbs.db
```
