amazon-science · gabriben · Nov 25, 2025 · Aug 20, 2025 · Aug 20, 2025 · Aug 20, 2025
diff --git a/.kiro/steering/product.md b/.kiro/steering/product.md
@@ -0,0 +1,39 @@
+# Product Overview
+
+## WRAVAL – WRiting Assist eVALuation
+
+WRAVAL is an evaluation framework for assessing Large Language Models (LLMs) and Small Language Models (SLMs) on writing assistant tasks. It focuses on non-reasoning tasks like tone transformation, summarization, and text improvement.
+
+### Purpose
+
+The framework addresses a gap in LM evaluation by focusing on practical writing assistant use cases rather than general reasoning tasks. It demonstrates that SLMs (under 10B parameters) can perform competitively on specific writing tasks despite scoring lower on general intelligence benchmarks.
+
+### Core Capabilities
+
+1. **Data Generation**: Synthetic dataset creation for various writing tasks using LLMs
+2. **Inference**: Running writing assistant tasks on both Bedrock-hosted and self-hosted models
+3. **Evaluation**: LLM-as-a-judge and human evaluation workflows
+4. **Deployment**: SageMaker endpoint deployment for custom models
+
+### Supported Writing Tasks (Tones)
+
+- **witty**: Transform factual sentences to witty versions
+- **professional**: Convert casual text to professional tone
+- **casual**: Make formal text more casual
+- **elaborate**: Expand simple sentences with detail
+- **shorten**: Condense wordy text
+- **improve**: Enhance poorly written sentences
+- **keypoints**: Extract key points from paragraphs
+- **proofread**: Correct errors in text
+- **emojify**: Add emojis to plain text
+- **summarize**: Create paragraph summaries
+
+### Target Users
+
+- ML practitioners evaluating SLMs for edge/private computing
+- Researchers benchmarking models on specific writing tasks
+- Teams implementing writing assistant features
+
+### Key Innovation
+
+The framework enables evaluation of models on tasks they excel at, rather than forcing comparison on general reasoning benchmarks where SLMs underperform.
diff --git a/.kiro/steering/structure.md b/.kiro/steering/structure.md
@@ -0,0 +1,123 @@
+# Project Structure
+
+## Directory Layout
+
+```
+wraval/
+├── config/
+│   └── settings.toml              # Model and AWS configuration
+├── data/                          # Generated datasets (timestamped CSVs)
+│   ├── clean/                     # Cleaned/processed datasets
+│   ├── old/                       # Archived datasets
+│   └── unique_queries/            # Deduplicated queries
+├── src/wraval/                    # Main package source
+│   ├── __init__.py
+│   ├── main.py                    # CLI entry point (Typer app)
+│   ├── aws_config.py              # AWS configuration and warning suppression
+│   ├── testing.py                 # Testing utilities
+│   ├── actions/                   # Core action modules
+│   │   ├── action_generate.py    # Data generation logic
+│   │   ├── action_inference.py   # Model inference execution
+│   │   ├── action_llm_judge.py   # LLM-as-a-judge evaluation
+│   │   ├── action_deploy.py      # SageMaker deployment
+│   │   ├── action_results.py     # Results visualization
+│   │   ├── action_examples.py    # Example display
+│   │   ├── action_human_judge_upload.py  # Human eval setup
+│   │   ├── action_human_judge_parsing.py # Human eval parsing
+│   │   ├── aws_utils.py          # AWS helper functions
+│   │   ├── completion.py         # Model completion wrappers
+│   │   ├── data_utils.py         # Data manipulation utilities
+│   │   ├── format.py             # Prompt formatting
+│   │   ├── model_router.py       # Model endpoint routing
+│   │   ├── prompt_tones.py       # Tone definitions and prompts
+│   │   ├── prompts_judge.py      # Judge evaluation prompts
+│   │   ├── data_generation_prompts.py  # Data gen prompts
+│   │   ├── read_random_lines.py  # Sampling utilities
+│   │   ├── cloudformation.yml    # CloudFormation templates
+│   │   ├── cloudformation_BedrockBatchInference.yml
+│   │   └── groundtruth_eval_template.html  # Human eval UI
+│   ├── custom_prompts/           # Custom prompt templates
+│   │   ├── data_generation_prompts.py
+│   │   ├── prompt_tones.py
+│   │   ├── prompts_judge.py
+│   │   ├── tone_prompts.py
+│   │   └── s3_transfer.sh        # S3 sync script
+│   └── model_artifacts/          # SageMaker deployment artifacts
+│       └── code/
+│           ├── inference.py      # SageMaker inference handler
+│           └── requirements.txt  # Model deployment deps
+├── resources/                     # Documentation and presentations
+├── build/                         # Build artifacts
+├── .ipynb_checkpoints/           # Jupyter notebook checkpoints
+├── pyproject.toml                # Package configuration
+├── setup.py                      # Setup script
+├── requirements.txt              # Pinned dependencies
+├── LICENSE-2.0.txt               # Apache 2.0 license
+├── NOTICE.txt                    # Copyright notice
+└── README.md                     # Project documentation
+```
+
+## Module Organization
+
+### Entry Point
+- **main.py**: CLI application using Typer with commands for each workflow step
+
+### Actions Module (`src/wraval/actions/`)
+Core functionality organized by workflow step:
+- **Generation**: `action_generate.py` - Creates synthetic datasets
+- **Inference**: `action_inference.py` - Runs models on datasets
+- **Evaluation**: `action_llm_judge.py` - Automated evaluation
+- **Deployment**: `action_deploy.py` - SageMaker endpoint management
+- **Human Eval**: `action_human_judge_*.py` - Human evaluation workflows
+- **Utilities**: Supporting modules for AWS, data, prompts, formatting
+
+### Custom Prompts (`src/wraval/custom_prompts/`)
+User-customizable prompt templates that override the defaults when the `--custom-prompts` flag is used.
+
+### Model Artifacts (`src/wraval/model_artifacts/`)
+SageMaker-specific deployment code:
+- `inference.py`: Custom inference handler for deployed models
+- `requirements.txt`: Runtime dependencies for deployed models
+
+## Configuration Files
+
+### settings.toml
+Environment-based configuration with model profiles:
+- `[default]`: Base settings (region, buckets, roles)
+- `[model-name]`: Model-specific configs (endpoint type, HF model name)
+- Supports string interpolation for AWS account/region
+
+### pyproject.toml
+Package metadata and dependencies:
+- Main dependencies in `dependencies` array
+- Optional GPU dependencies in `[project.optional-dependencies]`
+- Entry point: `wraval` command → `wraval.main:main`
+
+## Data Flow
+
+1. **Generation**: `wraval generate` → `data/all-{timestamp}.csv`
+2. **Inference**: Reads latest CSV → adds model outputs → saves updated CSV
+3. **Evaluation**: Reads CSV with outputs → adds judge scores → saves updated CSV
+4. **Human Eval**: Samples from CSV → uploads to S3 → creates SageMaker Ground Truth job
+
+## File Naming Conventions
+
+- **Datasets**: `all-{YYYYMMDD_HHMMSS}.csv` (timestamped)
+- **Actions**: `action_{verb}.py` (e.g., `action_generate.py`)
+- **Utilities**: `{noun}_utils.py` (e.g., `aws_utils.py`, `data_utils.py`)
+- **Prompts**: `{type}_prompts.py`, `prompt_{type}.py`, or `prompts_{type}.py` (e.g., `data_generation_prompts.py`, `prompt_tones.py`, `prompts_judge.py`)
+
+## Import Patterns
+
+- Actions import from sibling modules: `from wraval.actions.{module} import {function}`
+- Main imports actions: `from wraval.actions.action_{name} import {function}`
+- Config loaded via dynaconf: `Dynaconf(settings_files=[...])`
+- AWS config imported first to suppress warnings: `from wraval.aws_config import *`
+
+## Key Architectural Patterns
+
+1. **CLI-driven**: All functionality exposed through Typer commands
+2. **Configuration-based**: Model behavior controlled via settings.toml profiles
+3. **Stateless actions**: Each action reads/writes CSV files independently
+4. **Pluggable prompts**: Custom prompts override defaults when specified
+5. **Multi-endpoint**: Unified interface for Bedrock, SageMaker, Ollama
diff --git a/.kiro/steering/tech.md b/.kiro/steering/tech.md
@@ -0,0 +1,107 @@
+# Technology Stack
+
+## Build System & Package Management
+
+- **Package Manager**: `uv` (modern Python package manager)
+- **Build System**: setuptools with pyproject.toml
+- **Python Version**: >=3.9
+
+## Core Dependencies
+
+### ML & AI Frameworks
+- **transformers** (4.51.0, pinned in `pyproject.toml`): HuggingFace transformers for model loading
+- **torch** (2.6.0): PyTorch for model inference
+- **accelerate**: Distributed training and inference
+- **bitsandbytes**: Quantization support (GPU optional dependency)
+
+### AWS Integration
+- **boto3**: AWS SDK for Python
+- **sagemaker** (2.236.0): SageMaker model deployment
+- **bedrock-runtime** (boto3 service client, not a separate package): Bedrock model inference
+
+### Data & Utilities
+- **pandas** (2.2.3): Data manipulation
+- **datasets** (3.2.0): HuggingFace datasets
+- **dynaconf** (3.2.7): Configuration management
+- **typer**: CLI framework
+- **plotly** (5.24.1): Visualization
+- **beautifulsoup4**: HTML parsing
+
+## Configuration Management
+
+Configuration is managed via `dynaconf` with environment-based settings in `config/settings.toml`:
+- Model configurations (Bedrock, SageMaker, Ollama endpoints)
+- AWS region and account settings
+- S3 bucket paths for data and models
+- Endpoint types and model mappings
+
+## Common Commands
+
+### Installation
+```bash
+# Standard installation
+uv pip install .
+
+# With GPU support (requires CUDA)
+uv pip install ".[gpu]"
+```
+
+### CLI Commands
+```bash
+# Generate evaluation data
+wraval generate --model haiku-3 --type witty
+
+# Run inference on generated data
+wraval inference --model nova-lite --type all
+
+# Evaluate with LLM-as-a-judge
+wraval llm-judge --model haiku-3 --type professional
+
+# Deploy model to SageMaker
+wraval deploy-model --model phi-3-5-4B
+
+# Show examples from dataset
+wraval show-examples --model haiku-3 --type witty --n-examples 10
+
+# Upload for human evaluation
+wraval human-judge-upload --type all --n-samples 100
+
+# View results
+wraval show-results --type all
+```
+
+### Common Options
+- `--model, -m`: Model identifier from settings.toml
+- `--type, -t`: Tone type (witty, professional, casual, etc. or 'all')
+- `--upload-s3`: Upload results to S3
+- `--custom-prompts`: Use custom prompt templates
+- `--local-tokenizer-path`: Path to local tokenizer
+
+## Project Structure
+
+Entry point: `src/wraval/main.py` (CLI using Typer)
+
+Key modules:
+- `actions/`: Core functionality (generate, inference, judge, deploy)
+- `custom_prompts/`: Prompt templates for different tones
+- `model_artifacts/`: SageMaker deployment artifacts
+- `config/settings.toml`: Model and AWS configuration
+
+## Data Storage
+
+- **Local**: `./data/` directory with timestamped CSV files
+- **S3**: Configurable bucket paths for datasets and human evaluation
+- **Format**: CSV files with columns for input, output, model, tone, timestamps
+
+## Endpoint Types
+
+1. **bedrock**: AWS Bedrock hosted models (Claude, Nova)
+2. **sagemaker**: Self-hosted models on SageMaker endpoints
+3. **ollama**: Local Ollama endpoints (for development)
+
+## Development Notes
+
+- AWS credentials required for Bedrock/SageMaker operations
+- GPU support needed for model deployment (`bitsandbytes` dependency)
+- Configuration uses string formatting for AWS account/region injection
+- All CLI commands support `--help` for detailed usage
diff --git a/config/settings.toml b/config/settings.toml
@@ -1,10 +1,12 @@
 [default]
-region = 'us-east-1'
-data_dir = 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-# "./data"
-deploy_bucket_name = 'llm-finetune-us-east-1-{aws_account}'
+# region = 'us-east-1'
+region = 'us-west-2'
+deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
+data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
+# data_dir = './data/'
+human_eval_dir = 's3://llm-finetune-{region}-{aws_account}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
-sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
+sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
 endpoint_type = 'bedrock'
 model = 'anthropic.claude-3-haiku-20240307-v1:0'
 
@@ -37,11 +39,42 @@ model = 'Phi-3-5-mini-instruct'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
 endpoint_type = 'sagemaker'
 
+[qwen-3-0-6B]
+model = 'Qwen3-0-6B'
+hf_name = 'Qwen/Qwen3-0.6B' # Qwen3 naming: the plain repo id is the instruct-tuned model; the base model has a '-Base' suffix
+endpoint_type = 'sagemaker'
+thinking = false
+
 [qwen-2-5-1-5B]
 model = 'Qwen2-5-1-5B-Instruct'
 hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'
 endpoint_type = 'sagemaker'
 
+[qwen-3-1-7B]
+model = 'Qwen3-1-7B'
+hf_name = 'Qwen/Qwen3-1.7B' # Qwen3 naming: the plain repo id is the instruct-tuned model; the base model has a '-Base' suffix
+endpoint_type = 'sagemaker'
+thinking = false
+
+[qwen-3-1-7B-async]
+model = 'Qwen3-1-7B-async'
+hf_name = 'Qwen/Qwen3-1.7B' # Qwen3 naming: the plain repo id is the instruct-tuned model; the base model has a '-Base' suffix
+endpoint_type = 'sagemaker'
+thinking = false
+asynchronous = true
+
+[qwen-3-4B]
+model = 'Qwen3-4B'
+hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
+endpoint_type = 'sagemaker'
+
+[qwen-3-4B-async]
+model = 'Qwen3-4B-Async'
+hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
+endpoint_type = 'sagemaker'
+thinking = false
+asynchronous = true
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'

diff --git a/pyproject.toml b/pyproject.toml
@@ -19,8 +19,6 @@ dependencies = [
     "boto3",
     "plotly~=5.24.1",
     "transformers==4.51.0",
-    "datasets~=3.2.0",
-    "evaluate~=0.4.3",
     "dynaconf~=3.2.7",
     "torch",
     "botocore",

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
@@ -6,6 +6,7 @@
 import boto3
 import torch
 from sagemaker.huggingface import HuggingFaceModel
+from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -80,7 +81,7 @@ def write_model_to_s3(settings, model_name):
     return s3_uri
 
 
-def deploy_endpoint(s3_uri, role, endpoint_name):
+def deploy_endpoint(s3_uri, role, endpoint_name, async_config=None):
     env = {
         "HF_TASK": "text-generation",
         "HF_HUB_OFFLINE": "1",
@@ -100,13 +101,14 @@ def deploy_endpoint(s3_uri, role, endpoint_name):
         initial_instance_count=1,
         instance_type="ml.g5.2xlarge",
         endpoint_name=endpoint_name,
+        async_inference_config=async_config,
     )
 
 
 def validate_deployment(predictor):
     try:
         sagemaker_runtime_client = boto3.client("sagemaker-runtime")
-        input_string = json.dumps({"inputs": "Hello, my dog is a little"})
+        input_string = json.dumps({"inputs": "<|im_start|>user\nHello, can you pass me the milk?<|im_end|>\n<|im_start|>assistant\n"})
         response = sagemaker_runtime_client.invoke_endpoint(
             EndpointName=predictor.endpoint_name,
             Body=input_string.encode("utf-8"),
@@ -142,10 +144,13 @@ def cleanup_model_directory():
 def deploy(settings):
     validate_model_directory()
     cleanup_model_directory()
-    sanitized_model_name = settings.hf_name.split("/")[1].replace(".", "-")
+    sanitized_model_name = settings.model.replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
+    async_config = None
+    if settings.exists('asynchronous'):
+        async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(
-        s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name
+        s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
     )
     validate_deployment(predictor)