From 5d329ce0febea754f8179f8d4669514c139a7fb3 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 15:11:59 +0200
Subject: [PATCH 01/24] deploy fix and add qwen 1.7

---
 config/settings.toml | 11 +++++++++--
 pyproject.toml       |  1 -
 src/wraval/main.py   |  8 +++++++-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index fbf1d08..7792f63 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,7 +1,8 @@
 [default]
-region = 'us-east-1'
+region = 'us-west-2'
 data_dir = 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-# "./data"
+# 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
+human_eval_dir = 's3://llm-finetune-us-east-1-{aws_account}/human_eval/tones/'
 deploy_bucket_name = 'llm-finetune-us-east-1-{aws_account}'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
@@ -42,6 +43,12 @@ model = 'Qwen2-5-1-5B-Instruct'
 hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'
 endpoint_type = 'sagemaker'
 
+[qwen-3-1-7B]
+model = 'Qwen3-1.7B-Instruct'
+hf_name = 'Qwen/Qwen3-1.7B'
+# instruct is now this, and base is appended with 'base'
+endpoint_type = 'sagemaker'
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
diff --git a/pyproject.toml b/pyproject.toml
index f12bce3..9b9c596 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
     "boto3",
     "plotly~=5.24.1",
     "transformers==4.51.0",
-    "datasets~=3.2.0",
     "evaluate~=0.4.3",
     "dynaconf~=3.2.7",
     "torch",
diff --git a/src/wraval/main.py b/src/wraval/main.py
index b0e269e..9a32c20 100644
--- a/src/wraval/main.py
+++ b/src/wraval/main.py
@@ -228,7 +228,13 @@ def human_judge_upload(
 
 
 @app.command()
-def deploy(
+def human_judge_parsing():
+    """Parse human judgments, merge them into the original results table and create a plot."""
+    settings = get_settings()
+    parse_human_judgements(settings)
+
+@app.command()
+def deploy_model(
     model: str = typer.Option("haiku-3", "--model", "-m", help="Model to deploy"),
     cleanup_endpoints: bool = typer.Option(
         False,

From 524e66cc59e3949d7a521d32556bb80c8a1b5a6a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 15:20:41 +0200
Subject: [PATCH 02/24] remove bleu score as not in use for now

---
 pyproject.toml                  |  1 -
 src/wraval/actions/bleu_conf.py | 49 ---------------------------------
 2 files changed, 50 deletions(-)
 delete mode 100644 src/wraval/actions/bleu_conf.py

diff --git a/pyproject.toml b/pyproject.toml
index 9b9c596..2c9593f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
     "boto3",
     "plotly~=5.24.1",
     "transformers==4.51.0",
-    "evaluate~=0.4.3",
     "dynaconf~=3.2.7",
     "torch",
     "botocore",
diff --git a/src/wraval/actions/bleu_conf.py b/src/wraval/actions/bleu_conf.py
deleted file mode 100644
index 5738d6e..0000000
--- a/src/wraval/actions/bleu_conf.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#
-# // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# // SPDX-License-Identifier: Apache-2.0
-#
-import random
-import numpy as np
-from evaluate import load
-
-# Load BLEU metric
-bleu = load("bleu")
-
-
-def compute_bleu_with_ci(
-    predictions, references, num_bootstrap_samples=1000, confidence_level=0.95
-):
-    # Compute the original BLEU score
-    original_bleu = bleu.compute(predictions=predictions, references=references)["bleu"]
-
-    # Bootstrap sampling
-    bootstrap_scores = []
-    n = len(predictions)
-
-    for _ in range(num_bootstrap_samples):
-        # Sample indices with replacement
-        indices = [random.randint(0, n - 1) for _ in range(n)]
-        sampled_predictions = [predictions[i] for i in indices]
-        sampled_references = [references[i] for i in indices]
-
-        # Compute BLEU for the bootstrap sample
-        score = bleu.compute(
-            predictions=sampled_predictions, references=sampled_references
-        )["bleu"]
-        bootstrap_scores.append(score)
-
-    # Calculate confidence intervals
-    lower_bound = np.percentile(bootstrap_scores, (1 - confidence_level) / 2 * 100)
-    upper_bound = np.percentile(bootstrap_scores, (1 + confidence_level) / 2 * 100)
-
-    return {"bleu": original_bleu, "confidence_interval": (lower_bound, upper_bound)}
-
-
-# Example usage
-predictions = ["This is a test", "Another sentence"]
-references = [["This is a test"], ["Another sentence"]]
-
-results = compute_bleu_with_ci(predictions, references)
-
-print(f"BLEU Score: {results['bleu']}")
-print(f"95% Confidence Interval: {results['confidence_interval']}")

From 2f4c637e51b47aeb040dd499e9beb9863ecc82fe Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 15:23:45 +0200
Subject: [PATCH 03/24] move the comment in settings toml

---
 config/settings.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 7792f63..53a9d49 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -45,8 +45,7 @@ endpoint_type = 'sagemaker'
 
 [qwen-3-1-7B]
 model = 'Qwen3-1.7B-Instruct'
-hf_name = 'Qwen/Qwen3-1.7B'
-# instruct is now this, and base is appended with 'base'
+hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 
 [phi-3-ollama]

From b5f05d624ce3bba7bfcedeeec65c97120993fac5 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 17:40:26 +0200
Subject: [PATCH 04/24] using dynaconf variable interpolation

---
 config/settings.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 53a9d49..259dde2 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,9 +1,9 @@
 [default]
 region = 'us-west-2'
-data_dir = 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
+deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
+data_dir = 's3://{deploy_bucket_name}/eval/tones/'
 # 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-human_eval_dir = 's3://llm-finetune-us-east-1-{aws_account}/human_eval/tones/'
-deploy_bucket_name = 'llm-finetune-us-east-1-{aws_account}'
+human_eval_dir = 's3://{deploy_bucket_name}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
 endpoint_type = 'bedrock'

From 7519af6093902889cfc0c86007f137b5efe570c7 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 17:41:03 +0200
Subject: [PATCH 05/24] using dynaconf variable interpolation for sagemaker
 role too

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index 259dde2..510d453 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -5,7 +5,7 @@ data_dir = 's3://{deploy_bucket_name}/eval/tones/'
 # 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
 human_eval_dir = 's3://{deploy_bucket_name}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
-sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
+sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
 endpoint_type = 'bedrock'
 model = 'anthropic.claude-3-haiku-20240307-v1:0'
 

From 837d55f64adb88e286f535f194f42a65fa29d957 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 17:42:36 +0200
Subject: [PATCH 06/24] using dynaconf variable interpolation with only one
 nesting

---
 config/settings.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 510d453..b6394db 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,9 +1,9 @@
 [default]
 region = 'us-west-2'
 deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
-data_dir = 's3://{deploy_bucket_name}/eval/tones/'
+data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
 # 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-human_eval_dir = 's3://{deploy_bucket_name}/human_eval/tones/'
+human_eval_dir = 's3://llm-finetune-{region}-{aws_account}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
 endpoint_type = 'bedrock'

From ba7ff24b9ec757172fd36b8253c83a317bcc1e77 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Wed, 20 Aug 2025 17:53:11 +0200
Subject: [PATCH 07/24] dropping dynaconf variable interpolation but keeping
 region flexible in python

---
 src/wraval/main.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/wraval/main.py b/src/wraval/main.py
index 9a32c20..3a81d49 100644
--- a/src/wraval/main.py
+++ b/src/wraval/main.py
@@ -62,6 +62,11 @@ def get_settings(
     if local_tokenizer_path:
         settings.local_tokenizer_path = local_tokenizer_path
 
+    settings.deploy_bucket_name = settings.deploy_bucket_name.format(region=settings.region, aws_account=settings.aws_account)
+    settings.data_dir = settings.data_dir.format(region=settings.region, aws_account=settings.aws_account)
+    settings.human_eval_dir = settings.human_eval_dir.format(region=settings.region, aws_account=settings.aws_account)
+    settings.sagemaker_execution_role_arn = settings.sagemaker_execution_role_arn.format(region=settings.region, aws_account=settings.aws_account)    
+
     # Format settings with AWS account
     settings.model = settings.model.format(aws_account=settings.aws_account)
     settings.data_dir = settings.data_dir.format(aws_account=settings.aws_account)

From 0b1463ff3e08f860a6f27234cc792346a38203d3 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Thu, 21 Aug 2025 17:19:34 +0200
Subject: [PATCH 08/24] no_think input and output

---
 config/settings.toml                   |  4 +++-
 src/wraval/actions/action_deploy.py    |  2 +-
 src/wraval/actions/action_inference.py |  3 ++-
 src/wraval/actions/completion.py       | 24 ++++++++++++++++++------
 src/wraval/actions/format.py           |  7 +++++--
 src/wraval/actions/model_router.py     |  9 +++++++--
 6 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index b6394db..67603ae 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,4 +1,5 @@
 [default]
+# region = 'us-east-1'
 region = 'us-west-2'
 deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
 data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
@@ -44,9 +45,10 @@ hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'
 endpoint_type = 'sagemaker'
 
 [qwen-3-1-7B]
-model = 'Qwen3-1.7B-Instruct'
+model = 'Qwen3-1-7B'
 hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
+thinking = false
 
 [phi-3-ollama]
 model = 'phi3'
diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 8cd2aa0..273d4ee 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -106,7 +106,7 @@ def deploy_endpoint(s3_uri, role, endpoint_name):
 def validate_deployment(predictor):
     try:
         sagemaker_runtime_client = boto3.client("sagemaker-runtime")
-        input_string = json.dumps({"inputs": "Hello, my dog is a little"})
+        input_string = json.dumps({"inputs": "<|im_start|>user\nHello, can you pass me the milk?<|im_end|>\n<|im_start|>assistant\n"})
         response = sagemaker_runtime_client.invoke_endpoint(
             EndpointName=predictor.endpoint_name,
             Body=input_string.encode("utf-8"),
diff --git a/src/wraval/actions/action_inference.py b/src/wraval/actions/action_inference.py
index b9ee95d..75a8394 100644
--- a/src/wraval/actions/action_inference.py
+++ b/src/wraval/actions/action_inference.py
@@ -45,8 +45,9 @@ def run_inference(
         print(f"Processing {len(queries)} inputs for tone: {tone}")
 
         outputs = route_completion(settings, queries, tone_prompt)
-
+        
         cleaned_output = [o.strip().strip('"') for o in outputs]
+        
         if no_rewrite:
             mask = results["tone"] == tone
             results.loc[mask, "rewrite"] = cleaned_output
diff --git a/src/wraval/actions/completion.py b/src/wraval/actions/completion.py
index 41951bf..f82e3e2 100644
--- a/src/wraval/actions/completion.py
+++ b/src/wraval/actions/completion.py
@@ -15,13 +15,25 @@
 
 # Function to extract last assistant response from each entry
 def extract_last_assistant_response(data):
-    matches = re.findall(r"<\|assistant\|>(.*?)<\|end\|>", data, re.DOTALL)
-    # matches = re.findall(r"<assistant>(.*?)</assistant>", data, re.DOTALL)
-    if matches:
-        return matches[-1].strip()
-    else:
-        return data
 
+    """Return the final assistant turn from a raw phi/qwen chat transcript."""
+    if "<|assistant|>" in data:  # phi chat template
+        # Plain substring match: regex escapes (\|) would be literal backslashes here.
+        assistant_part = data.split("<|assistant|>")[-1]
+        return assistant_part.replace("<|end|>", "").strip()
+
+    if "<|im_start|>assistant" in data:  # qwen chat template
+        assistant_part = data.split("<|im_start|>assistant")[-1]
+
+        # Remove the thinking part if it exists
+        if "<think>" in assistant_part:
+            # Extract everything after </think>
+            response = assistant_part.split("</think>")[-1]
+        else:
+            response = assistant_part
+        return response.replace("<|im_end|>", "").strip()
+
+    return data
 
 def get_bedrock_completion(settings, prompt, system_prompt=None):
     bedrock_client = boto3.client(
diff --git a/src/wraval/actions/format.py b/src/wraval/actions/format.py
index d789682..25056db 100644
--- a/src/wraval/actions/format.py
+++ b/src/wraval/actions/format.py
@@ -6,7 +6,7 @@
 import xml.dom.minidom
 
 
-def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock"):
+def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock", thinking=None):
     """
     Format prompts according to each model's prompt guidelines (e.g. xml tags for Haiku).
 
@@ -18,7 +18,10 @@ def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock"):
 
     if type == "hf":
         if prompt:
-            sys_prompt = [{"role": "system", "content": prompt.sys_prompt}]
+            if thinking is None or True:
+                sys_prompt = [{"role": "system", "content": prompt.sys_prompt}]
+            else:
+                sys_prompt = [{"role": "system", "content": prompt.sys_prompt + '/no_think'}]
             messages = []
             if prompt.examples:
                 for k, v in prompt.examples[0].items():
diff --git a/src/wraval/actions/model_router.py b/src/wraval/actions/model_router.py
index 40a8a77..a404be4 100644
--- a/src/wraval/actions/model_router.py
+++ b/src/wraval/actions/model_router.py
@@ -45,14 +45,19 @@ class SageMakerRouter(HuggingFaceModelRouter):
     def __init__(self, master_sys_prompt, settings):
         super().__init__(master_sys_prompt, settings)
         self.model_name = settings.model
+        self.region = settings.region
+        if settings.exists('thinking'):
+            self.thinking = settings.thinking
+        else:
+            self.thinking = None
 
     def get_completion(self, queries: List[str]) -> List[str]:
         prompts = [
-            format_prompt(text, self.master_sys_prompt, self.tokenizer, type="hf")
+            format_prompt(text, self.master_sys_prompt, self.tokenizer, "hf", self.thinking)
             for text in queries
         ]
         return [
-            invoke_sagemaker_endpoint({"inputs": prompt}, self.model_name) for prompt in tqdm(prompts)
+            invoke_sagemaker_endpoint({"inputs": prompt}, self.model_name, self.region) for prompt in tqdm(prompts)
         ]
 
 

From b58fd5c1c2d683672109ea0e37c91d914890ea5a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Fri, 22 Aug 2025 11:29:55 +0200
Subject: [PATCH 09/24] try async

---
 config/settings.toml                   |  7 +++++++
 src/wraval/actions/action_deploy.py    | 13 +++++++++++--
 src/wraval/actions/action_inference.py |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 67603ae..d320bee 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -50,6 +50,13 @@ hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'b
 endpoint_type = 'sagemaker'
 thinking = false
 
+[qwen-3-1-7B-async]
+model = 'Qwen3-1-7B'
+hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
+endpoint_type = 'sagemaker'
+thinking = false
+async = true
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 273d4ee..3f009d4 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -6,6 +6,7 @@
 import boto3
 import torch
 from sagemaker.huggingface import HuggingFaceModel
+from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -80,7 +81,7 @@ def write_model_to_s3(settings, model_name):
     return s3_uri
 
 
-def deploy_endpoint(s3_uri, role, endpoint_name):
+def deploy_endpoint(s3_uri, role, endpoint_name, async_config=None):
     env = {
         "HF_TASK": "text-generation",
         "HF_HUB_OFFLINE": "1",
@@ -100,6 +101,7 @@ def deploy_endpoint(s3_uri, role, endpoint_name):
         initial_instance_count=1,
         instance_type="ml.g5.2xlarge",
         endpoint_name=endpoint_name,
+        async_inference_config=async_config,
     )
 
 
@@ -145,7 +147,14 @@ def deploy(settings):
     sanitized_model_name = settings.hf_name.split("/")[1].replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
+    if settings.exists('async'):
+        async_config = AsyncInferenceConfig(
+            max_concurrency=1000,
+            max_invocations=1000,
+            max_payload_in_mb=1000
+        )
+
     predictor = deploy_endpoint(
-        s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name
+        s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
     )
     validate_deployment(predictor)
diff --git a/src/wraval/actions/action_inference.py b/src/wraval/actions/action_inference.py
index 75a8394..e1bfffd 100644
--- a/src/wraval/actions/action_inference.py
+++ b/src/wraval/actions/action_inference.py
@@ -40,7 +40,7 @@ def run_inference(
 
         tone_prompt = get_prompt(Tone(tone))
 
-        queries = results[results["tone"] == tone]["synthetic_data"]
+        queries = results[results["tone"] == tone]["synthetic_data"].unique()
 
         print(f"Processing {len(queries)} inputs for tone: {tone}")
 

From 8bdcd9cac7966f2049e2d319180e5ee811a023f5 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Fri, 22 Aug 2025 11:52:24 +0200
Subject: [PATCH 10/24] empty async config

---
 src/wraval/actions/action_deploy.py    | 7 +------
 src/wraval/actions/action_inference.py | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 3f009d4..29b47f6 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -148,12 +148,7 @@ def deploy(settings):
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
     if settings.exists('async'):
-        async_config = AsyncInferenceConfig(
-            max_concurrency=1000,
-            max_invocations=1000,
-            max_payload_in_mb=1000
-        )
-
+        async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(
         s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
     )
diff --git a/src/wraval/actions/action_inference.py b/src/wraval/actions/action_inference.py
index e1bfffd..b8f0c19 100644
--- a/src/wraval/actions/action_inference.py
+++ b/src/wraval/actions/action_inference.py
@@ -54,7 +54,7 @@ def run_inference(
             results.loc[mask, "inference_model"] = model_name
         else:
             new_results = pd.DataFrame(
-                {"synthetic_data": results[results["tone"] == tone]["synthetic_data"]}
+                {"synthetic_data": results[results["tone"] == tone]["synthetic_data"].unique()}
             )
             new_results["tone"] = tone
             new_results["rewrite"] = cleaned_output

From 9ac330f9af2b47feed79bfd873aaade0c38789fd Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Fri, 22 Aug 2025 12:01:08 +0200
Subject: [PATCH 11/24] different async qwen name

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index d320bee..e8e0cef 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -51,7 +51,7 @@ endpoint_type = 'sagemaker'
 thinking = false
 
 [qwen-3-1-7B-async]
-model = 'Qwen3-1-7B'
+model = 'Qwen3-1-7B-async'
 hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 thinking = false

From 7c3471587e4c7057e01c1f068925004440905c7d Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Fri, 22 Aug 2025 12:09:12 +0200
Subject: [PATCH 12/24] use model name and not hf name as sagemaker endpoint
 name

---
 src/wraval/actions/action_deploy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 29b47f6..2c6f0e5 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -144,7 +144,7 @@ def cleanup_model_directory():
 def deploy(settings):
     validate_model_directory()
     cleanup_model_directory()
-    sanitized_model_name = settings.hf_name.split("/")[1].replace(".", "-")
+    sanitized_model_name = settings.model.split("/")[1].replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
     if settings.exists('async'):

From e132f59b369f8e8542c767bfeb023d9102c83435 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Fri, 22 Aug 2025 12:10:50 +0200
Subject: [PATCH 13/24] model name has no slash unlike hf name

---
 src/wraval/actions/action_deploy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 2c6f0e5..61e5380 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -144,7 +144,7 @@ def cleanup_model_directory():
 def deploy(settings):
     validate_model_directory()
     cleanup_model_directory()
-    sanitized_model_name = settings.model.split("/")[1].replace(".", "-")
+    sanitized_model_name = settings.model.replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
     if settings.exists('async'):

From 019d5619ad2081e1a21a7be19cc42d03bc19d46d Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Mon, 25 Aug 2025 11:58:10 +0200
Subject: [PATCH 14/24] add qwen 3 4B

---
 config/settings.toml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index e8e0cef..8cb835d 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -3,7 +3,7 @@
 region = 'us-west-2'
 deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
 data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
-# 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
+# data_dir = './data/'
 human_eval_dir = 's3://llm-finetune-{region}-{aws_account}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
@@ -57,6 +57,11 @@ endpoint_type = 'sagemaker'
 thinking = false
 async = true
 
+[qwen-3-4B]
+model = 'Qwen3-4B'
+hf_name = 'Qwen/Qwen3-4B-Instruct-2507' # this finetune is non-thinking only
+endpoint_type = 'sagemaker'
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'

From ba7bdbc6d624dcb4e1c616027728b36beed03563 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Mon, 25 Aug 2025 12:04:58 +0200
Subject: [PATCH 15/24] async config fix

---
 src/wraval/actions/action_deploy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 61e5380..fb5748e 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -147,6 +147,7 @@ def deploy(settings):
     sanitized_model_name = settings.model.replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
+    async_config = None
     if settings.exists('async'):
         async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(

From 815a24e0abfb209d2d979a3b9df50822e9226a4b Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Mon, 25 Aug 2025 14:32:58 +0200
Subject: [PATCH 16/24] small fixes and first attempt at a batch endpoint

---
 config/settings.toml                |   2 +-
 src/wraval/actions/action_deploy.py |   2 +-
 src/wraval/actions/completion.py    | 142 ++++++++++++++++++++++++++++
 src/wraval/actions/format.py        |   2 +-
 src/wraval/actions/model_router.py  |  18 +++-
 5 files changed, 160 insertions(+), 6 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 8cb835d..c2802fd 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -55,7 +55,7 @@ model = 'Qwen3-1-7B-async'
 hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 thinking = false
-async = true
+asynchronous = true
 
 [qwen-3-4B]
 model = 'Qwen3-4B'
diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index fb5748e..1ff18ff 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -148,7 +148,7 @@ def deploy(settings):
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
     async_config = None
-    if settings.exists('async'):
+    if settings.exists('asynchronous'):
         async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(
         s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
diff --git a/src/wraval/actions/completion.py b/src/wraval/actions/completion.py
index f82e3e2..e2699b0 100644
--- a/src/wraval/actions/completion.py
+++ b/src/wraval/actions/completion.py
@@ -11,6 +11,7 @@
 import boto3
 import re
 import requests
+import uuid
 
 
 # Function to extract last assistant response from each entry
@@ -232,3 +233,144 @@ def invoke_ollama_endpoint(payload, endpoint_name, url="127.0.0.1:11434"):
             lines.append(json.loads(r))
 
     return "".join([l["response"] for l in lines])
+
+
+def batch_invoke_sagemaker_endpoint(
+    payloads,
+    endpoint_name,
+    region="us-east-1",
+    s3_bucket=None,
+    s3_input_prefix="/eval/async/input/",
+    poll_interval_seconds=10,
+    timeout_seconds=600,
+):
+    """
+    Invoke a SageMaker async endpoint for a batch of payloads.
+
+    - payloads: list of JSON-serializable objects (each is one request)
+    - endpoint_name: name of the async SageMaker endpoint
+    - region: AWS region
+    - s3_bucket: S3 bucket to upload inputs (required)
+    - s3_input_prefix: S3 prefix for input uploads
+    - poll_interval_seconds: interval between checks for output readiness
+    - timeout_seconds: max time to wait for each result
+
+    Returns list of raw results (strings, None on timeout/failure) in the same order as payloads.
+    """
+    if s3_bucket is None:
+        raise ValueError("s3_bucket is required for async invocations")
+    if not isinstance(s3_bucket, str) or not s3_bucket.strip():
+        raise ValueError(
+            "s3_bucket must be a non-empty string (e.g., 'my-bucket-name'), got: "
+            f"{type(s3_bucket).__name__}"
+        )
+
+    sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)
+    s3_client = boto3.client("s3", region_name=region)
+
+    # Normalize prefix
+    input_prefix = s3_input_prefix.lstrip("/")
+
+    input_locations = []
+    output_locations = []
+    inference_ids = []
+
+    # 1) Upload all payloads and invoke async endpoint
+    for idx, payload in enumerate(payloads):
+        print(f"Submitting {idx + 1}/{len(payloads)} to async endpoint '{endpoint_name}'...")
+        request_id = str(uuid.uuid4())[:8]
+        input_key = f"{input_prefix}batch-{request_id}-{idx}.json"
+
+        # Ensure payload is in expected format for the model container
+        if isinstance(payload, str):
+            payload_to_upload = {"inputs": payload}
+        elif isinstance(payload, list) and all(isinstance(p, str) for p in payload):
+            payload_to_upload = {"inputs": payload}
+        elif isinstance(payload, dict):
+            payload_to_upload = payload
+        else:
+            # Fallback: wrap unknown types under inputs
+            payload_to_upload = {"inputs": payload}
+
+        s3_client.put_object(
+            Bucket=s3_bucket,
+            Key=input_key,
+            Body=json.dumps(payload_to_upload),
+            ContentType="application/json",
+        )
+
+        input_location = f"s3://{s3_bucket}/{input_key}"
+        input_locations.append(input_location)
+
+        response = sagemaker_runtime.invoke_endpoint_async(
+            EndpointName=endpoint_name,
+            InputLocation=input_location,
+            ContentType="application/json",
+            InvocationTimeoutSeconds=3600,
+        )
+
+        output_locations.append(response["OutputLocation"])  # s3 uri
+        inference_ids.append(response.get("InferenceId"))
+        print(f"Submitted {idx + 1}/{len(payloads)}. Output will be written to {response['OutputLocation']}")
+
+    # 2) Poll for each output and download results
+    results = []
+    for i, output_location in enumerate(output_locations):
+        start_time = time.time()
+
+        # Parse s3 uri: OutputLocation already names the result object itself
+        uri = output_location.replace("s3://", "")
+        bucket, key = uri.split("/", 1)
+        inference_id = inference_ids[i]
+        # Poll the exact object named by OutputLocation: the result is written
+        # to that key, and it is the same key that is downloaded further down.
+        print(f"Polling for result object s3://{bucket}/{key}")
+
+        while True:
+            try:
+                # The object only exists once the endpoint has written the result
+                s3_client.head_object(Bucket=bucket, Key=key)
+                break
+            except Exception:
+                if time.time() - start_time > timeout_seconds:
+                    print(f"Timed out waiting for result {i + 1}/{len(output_locations)} after {timeout_seconds}s")
+                    results.append(None)
+                    break
+                elapsed = int(time.time() - start_time)
+                print(f"Waiting for result {i + 1}/{len(output_locations)}... {elapsed}s elapsed")
+
+                # Try to detect async failure artifact: async-endpoint-failures/.../<InferenceId>-error.out
+                if isinstance(inference_id, str) and inference_id:
+                    try:
+                        candidates = s3_client.list_objects_v2(
+                            Bucket=bucket,
+                            Prefix="async-endpoint-failures/",
+                            MaxKeys=1000,
+                        )
+                        for obj in candidates.get("Contents", []):
+                            k = obj.get("Key", "")
+                            if k.endswith(f"{inference_id}-error.out"):
+                                err_obj = s3_client.get_object(Bucket=bucket, Key=k)
+                                err_text = err_obj["Body"].read().decode("utf-8", errors="replace")
+                                print(f"Error for request {i + 1}/{len(output_locations)} (InferenceId={inference_id}):\n{err_text}")
+                                results.append(None)
+                                # Stop waiting for this one
+                                elapsed = int(time.time() - start_time)
+                                print(f"Marking request {i + 1} as failed after {elapsed}s due to async failure artifact: s3://{bucket}/{k}")
+                                # Break out of the polling loop
+                                raise StopIteration
+                    except StopIteration:
+                        break
+                    except Exception:
+                        # Ignore listing errors silently and keep polling
+                        pass
+                time.sleep(poll_interval_seconds)
+        # A failed/timed-out request already appended None, so len(results) == i + 1.
+        if len(results) == i:
+            obj = s3_client.get_object(Bucket=bucket, Key=key)
+            result_body = obj["Body"].read().decode("utf-8")
+            results.append(result_body)
+            total = int(time.time() - start_time)
+            print(f"Result ready for {i + 1}/{len(output_locations)} after {total}s")
+
+    return results
diff --git a/src/wraval/actions/format.py b/src/wraval/actions/format.py
index 25056db..f9bc374 100644
--- a/src/wraval/actions/format.py
+++ b/src/wraval/actions/format.py
@@ -18,7 +18,7 @@ def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock", think
 
     if type == "hf":
         if prompt:
-            if thinking is None or True:
+            if thinking is None or thinking is True:
                 sys_prompt = [{"role": "system", "content": prompt.sys_prompt}]
             else:
                 sys_prompt = [{"role": "system", "content": prompt.sys_prompt + '/no_think'}]
diff --git a/src/wraval/actions/model_router.py b/src/wraval/actions/model_router.py
index a404be4..b05c1f0 100644
--- a/src/wraval/actions/model_router.py
+++ b/src/wraval/actions/model_router.py
@@ -2,6 +2,7 @@
     batch_get_bedrock_completions,
     invoke_sagemaker_endpoint,
     invoke_ollama_endpoint,
+    batch_invoke_sagemaker_endpoint,
 )
 from .format import format_prompt
 from transformers import AutoTokenizer
@@ -46,18 +47,29 @@ def __init__(self, master_sys_prompt, settings):
         super().__init__(master_sys_prompt, settings)
         self.model_name = settings.model
         self.region = settings.region
+        self.thinking = None
         if settings.exists('thinking'):
             self.thinking = settings.thinking
-        else:
-            self.thinking = None
+        self.async_config = False
+        if settings.exists('asynchronous'):
+            self.async_config = settings.asynchronous
+        self.deploy_bucket_name = settings.deploy_bucket_name
 
     def get_completion(self, queries: List[str]) -> List[str]:
         prompts = [
             format_prompt(text, self.master_sys_prompt, self.tokenizer, "hf", self.thinking)
             for text in queries
         ]
+        if self.async_config:
+            return batch_invoke_sagemaker_endpoint(prompts, 
+                                                   self.model_name, 
+                                                   self.region, 
+                                                   self.deploy_bucket_name)
         return [
-            invoke_sagemaker_endpoint({"inputs": prompt}, self.model_name, self.region) for prompt in tqdm(prompts)
+            invoke_sagemaker_endpoint({"inputs": prompt}, 
+                                      self.model_name, 
+                                      self.region) 
+                                      for prompt in tqdm(prompts)
         ]
 
 

From 6d046ce9cbeb41f96bcb660797c8cd58acbf1508 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Mon, 25 Aug 2025 16:01:31 +0200
Subject: [PATCH 17/24] add qwen 0.6B

---
 config/settings.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/config/settings.toml b/config/settings.toml
index c2802fd..32b6a6f 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -39,6 +39,12 @@ model = 'Phi-3-5-mini-instruct'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
 endpoint_type = 'sagemaker'
 
+[qwen-3-0-6B]
+model = 'Qwen3-0-6B'
+hf_name = 'Qwen/Qwen3-0-6B' # instruct is now this, and base is appended with 'base'
+endpoint_type = 'sagemaker'
+thinking = false
+
 [qwen-2-5-1-5B]
 model = 'Qwen2-5-1-5B-Instruct'
 hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'

From d5385a2da2d8fc59ce07d7b4e44ec11f640da76a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Mon, 25 Aug 2025 16:03:53 +0200
Subject: [PATCH 18/24] typo in qwen hf name

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index 32b6a6f..594fd24 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -41,7 +41,7 @@ endpoint_type = 'sagemaker'
 
 [qwen-3-0-6B]
 model = 'Qwen3-0-6B'
-hf_name = 'Qwen/Qwen3-0-6B' # instruct is now this, and base is appended with 'base'
+hf_name = 'Qwen/Qwen3-0.6B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 thinking = false
 

From 21c489c5ad5f342445e7bf3d3d6175f98444e530 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Tue, 25 Nov 2025 10:04:58 +0100
Subject: [PATCH 19/24] try to resolve pyarrow dep

---
 .kiro/steering/product.md   |  39 ++++++++++++
 .kiro/steering/structure.md | 123 ++++++++++++++++++++++++++++++++++++
 .kiro/steering/tech.md      | 107 +++++++++++++++++++++++++++++++
 pyproject.toml              |   3 +-
 4 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 .kiro/steering/product.md
 create mode 100644 .kiro/steering/structure.md
 create mode 100644 .kiro/steering/tech.md

diff --git a/.kiro/steering/product.md b/.kiro/steering/product.md
new file mode 100644
index 0000000..990bb16
--- /dev/null
+++ b/.kiro/steering/product.md
@@ -0,0 +1,39 @@
+# Product Overview
+
+## WRAVAL – WRiting Assist eVALuation
+
+WRAVAL is an evaluation framework for assessing Large Language Models (LLMs) and Small Language Models (SLMs) on writing assistant tasks. It focuses on non-reasoning tasks like tone transformation, summarization, and text improvement.
+
+### Purpose
+
+The framework addresses a gap in LM evaluation by focusing on practical writing assistant use cases rather than general reasoning tasks. It demonstrates that SLMs (under 10B parameters) can perform competitively on specific writing tasks despite scoring lower on general intelligence benchmarks.
+
+### Core Capabilities
+
+1. **Data Generation**: Synthetic dataset creation for various writing tasks using LLMs
+2. **Inference**: Running writing assistant tasks on both Bedrock-hosted and self-hosted models
+3. **Evaluation**: LLM-as-a-judge and human evaluation workflows
+4. **Deployment**: SageMaker endpoint deployment for custom models
+
+### Supported Writing Tasks (Tones)
+
+- **witty**: Transform factual sentences to witty versions
+- **professional**: Convert casual text to professional tone
+- **casual**: Make formal text more casual
+- **elaborate**: Expand simple sentences with detail
+- **shorten**: Condense wordy text
+- **improve**: Enhance poorly written sentences
+- **keypoints**: Extract key points from paragraphs
+- **proofread**: Correct errors in text
+- **emojify**: Add emojis to plain text
+- **summarize**: Create paragraph summaries
+
+### Target Users
+
+- ML practitioners evaluating SLMs for edge/private computing
+- Researchers benchmarking models on specific writing tasks
+- Teams implementing writing assistant features
+
+### Key Innovation
+
+The framework enables evaluation of models on tasks they excel at, rather than forcing comparison on general reasoning benchmarks where SLMs underperform.
diff --git a/.kiro/steering/structure.md b/.kiro/steering/structure.md
new file mode 100644
index 0000000..fc22366
--- /dev/null
+++ b/.kiro/steering/structure.md
@@ -0,0 +1,123 @@
+# Project Structure
+
+## Directory Layout
+
+```
+wraval/
+├── config/
+│   └── settings.toml              # Model and AWS configuration
+├── data/                          # Generated datasets (timestamped CSVs)
+│   ├── clean/                     # Cleaned/processed datasets
+│   ├── old/                       # Archived datasets
+│   └── unique_queries/            # Deduplicated queries
+├── src/wraval/                    # Main package source
+│   ├── __init__.py
+│   ├── main.py                    # CLI entry point (Typer app)
+│   ├── aws_config.py              # AWS configuration and warning suppression
+│   ├── testing.py                 # Testing utilities
+│   ├── actions/                   # Core action modules
+│   │   ├── action_generate.py    # Data generation logic
+│   │   ├── action_inference.py   # Model inference execution
+│   │   ├── action_llm_judge.py   # LLM-as-a-judge evaluation
+│   │   ├── action_deploy.py      # SageMaker deployment
+│   │   ├── action_results.py     # Results visualization
+│   │   ├── action_examples.py    # Example display
+│   │   ├── action_human_judge_upload.py  # Human eval setup
+│   │   ├── action_human_judge_parsing.py # Human eval parsing
+│   │   ├── aws_utils.py          # AWS helper functions
+│   │   ├── completion.py         # Model completion wrappers
+│   │   ├── data_utils.py         # Data manipulation utilities
+│   │   ├── format.py             # Prompt formatting
+│   │   ├── model_router.py       # Model endpoint routing
+│   │   ├── prompt_tones.py       # Tone definitions and prompts
+│   │   ├── prompts_judge.py      # Judge evaluation prompts
+│   │   ├── data_generation_prompts.py  # Data gen prompts
+│   │   ├── read_random_lines.py  # Sampling utilities
+│   │   ├── cloudformation.yml    # CloudFormation templates
+│   │   ├── cloudformation_BedrockBatchInference.yml
+│   │   └── groundtruth_eval_template.html  # Human eval UI
+│   ├── custom_prompts/           # Custom prompt templates
+│   │   ├── data_generation_prompts.py
+│   │   ├── prompt_tones.py
+│   │   ├── prompts_judge.py
+│   │   ├── tone_prompts.py
+│   │   └── s3_transfer.sh        # S3 sync script
+│   └── model_artifacts/          # SageMaker deployment artifacts
+│       └── code/
+│           ├── inference.py      # SageMaker inference handler
+│           └── requirements.txt  # Model deployment deps
+├── resources/                     # Documentation and presentations
+├── build/                         # Build artifacts
+├── .ipynb_checkpoints/           # Jupyter notebook checkpoints
+├── pyproject.toml                # Package configuration
+├── setup.py                      # Setup script
+├── requirements.txt              # Pinned dependencies
+├── LICENSE-2.0.txt               # Apache 2.0 license
+├── NOTICE.txt                    # Copyright notice
+└── README.md                     # Project documentation
+```
+
+## Module Organization
+
+### Entry Point
+- **main.py**: CLI application using Typer with commands for each workflow step
+
+### Actions Module (`src/wraval/actions/`)
+Core functionality organized by workflow step:
+- **Generation**: `action_generate.py` - Creates synthetic datasets
+- **Inference**: `action_inference.py` - Runs models on datasets
+- **Evaluation**: `action_llm_judge.py` - Automated evaluation
+- **Deployment**: `action_deploy.py` - SageMaker endpoint management
+- **Human Eval**: `action_human_judge_*.py` - Human evaluation workflows
+- **Utilities**: Supporting modules for AWS, data, prompts, formatting
+
+### Custom Prompts (`src/wraval/custom_prompts/`)
+User-customizable prompt templates that override defaults when the `--custom-prompts` flag is used.
+
+### Model Artifacts (`src/wraval/model_artifacts/`)
+SageMaker-specific deployment code:
+- `inference.py`: Custom inference handler for deployed models
+- `requirements.txt`: Runtime dependencies for deployed models
+
+## Configuration Files
+
+### settings.toml
+Environment-based configuration with model profiles:
+- `[default]`: Base settings (region, buckets, roles)
+- `[model-name]`: Model-specific configs (endpoint type, HF model name)
+- Supports string interpolation for AWS account/region
+
+### pyproject.toml
+Package metadata and dependencies:
+- Main dependencies in `dependencies` array
+- Optional GPU dependencies in `[project.optional-dependencies]`
+- Entry point: `wraval` command → `wraval.main:main`
+
+## Data Flow
+
+1. **Generation**: `wraval generate` → `data/all-{timestamp}.csv`
+2. **Inference**: Reads latest CSV → adds model outputs → saves updated CSV
+3. **Evaluation**: Reads CSV with outputs → adds judge scores → saves updated CSV
+4. **Human Eval**: Samples from CSV → uploads to S3 → creates SageMaker Ground Truth job
+
+## File Naming Conventions
+
+- **Datasets**: `all-{YYYYMMDD_HHMMSS}.csv` (timestamped)
+- **Actions**: `action_{verb}.py` (e.g., `action_generate.py`)
+- **Utilities**: `{noun}_utils.py` (e.g., `aws_utils.py`, `data_utils.py`)
+- **Prompts**: `{type}_prompts.py` or `prompt_{type}.py`
+
+## Import Patterns
+
+- Actions import from sibling modules: `from wraval.actions.{module} import {function}`
+- Main imports actions: `from wraval.actions.action_{name} import {function}`
+- Config loaded via dynaconf: `Dynaconf(settings_files=[...])`
+- AWS config imported first to suppress warnings: `from wraval.aws_config import *`
+
+## Key Architectural Patterns
+
+1. **CLI-driven**: All functionality exposed through Typer commands
+2. **Configuration-based**: Model behavior controlled via settings.toml profiles
+3. **Stateless actions**: Each action reads/writes CSV files independently
+4. **Pluggable prompts**: Custom prompts override defaults when specified
+5. **Multi-endpoint**: Unified interface for Bedrock, SageMaker, Ollama
diff --git a/.kiro/steering/tech.md b/.kiro/steering/tech.md
new file mode 100644
index 0000000..c9109e4
--- /dev/null
+++ b/.kiro/steering/tech.md
@@ -0,0 +1,107 @@
+# Technology Stack
+
+## Build System & Package Management
+
+- **Package Manager**: `uv` (modern Python package manager)
+- **Build System**: setuptools with pyproject.toml
+- **Python Version**: >=3.9
+
+## Core Dependencies
+
+### ML & AI Frameworks
+- **transformers** (4.48.1): HuggingFace transformers for model loading
+- **torch** (2.6.0): PyTorch for model inference
+- **accelerate**: Distributed training and inference
+- **bitsandbytes**: Quantization support (GPU optional dependency)
+
+### AWS Integration
+- **boto3**: AWS SDK for Python
+- **sagemaker** (2.236.0): SageMaker model deployment
+- **bedrock-runtime** (boto3 service client, not a separate package): Bedrock model inference
+
+### Data & Utilities
+- **pandas** (2.2.3): Data manipulation
+- **datasets** (3.2.0): HuggingFace datasets
+- **dynaconf** (3.2.7): Configuration management
+- **typer**: CLI framework
+- **plotly** (5.24.1): Visualization
+- **beautifulsoup4**: HTML parsing
+
+## Configuration Management
+
+Configuration is managed via `dynaconf` with environment-based settings in `config/settings.toml`:
+- Model configurations (Bedrock, SageMaker, Ollama endpoints)
+- AWS region and account settings
+- S3 bucket paths for data and models
+- Endpoint types and model mappings
+
+## Common Commands
+
+### Installation
+```bash
+# Standard installation
+uv pip install .
+
+# With GPU support (requires CUDA)
+uv pip install ".[gpu]"
+```
+
+### CLI Commands
+```bash
+# Generate evaluation data
+wraval generate --model haiku-3 --type witty
+
+# Run inference on generated data
+wraval inference --model nova-lite --type all
+
+# Evaluate with LLM-as-a-judge
+wraval llm-judge --model haiku-3 --type professional
+
+# Deploy model to SageMaker
+wraval deploy-model --model phi-3-5-4B
+
+# Show examples from dataset
+wraval show-examples --model haiku-3 --type witty --n-examples 10
+
+# Upload for human evaluation
+wraval human-judge-upload --type all --n-samples 100
+
+# View results
+wraval show-results --type all
+```
+
+### Common Options
+- `--model, -m`: Model identifier from settings.toml
+- `--type, -t`: Tone type (witty, professional, casual, etc. or 'all')
+- `--upload-s3`: Upload results to S3
+- `--custom-prompts`: Use custom prompt templates
+- `--local-tokenizer-path`: Path to local tokenizer
+
+## Project Structure
+
+Entry point: `src/wraval/main.py` (CLI using Typer)
+
+Key modules:
+- `actions/`: Core functionality (generate, inference, judge, deploy)
+- `custom_prompts/`: Prompt templates for different tones
+- `model_artifacts/`: SageMaker deployment artifacts
+- `config/settings.toml`: Model and AWS configuration
+
+## Data Storage
+
+- **Local**: `./data/` directory with timestamped CSV files
+- **S3**: Configurable bucket paths for datasets and human evaluation
+- **Format**: CSV files with columns for input, output, model, tone, timestamps
+
+## Endpoint Types
+
+1. **bedrock**: AWS Bedrock hosted models (Claude, Nova)
+2. **sagemaker**: Self-hosted models on SageMaker endpoints
+3. **ollama**: Local Ollama endpoints (for development)
+
+## Development Notes
+
+- AWS credentials required for Bedrock/SageMaker operations
+- GPU support needed for model deployment (`bitsandbytes` dependency)
+- Configuration uses string formatting for AWS account/region injection
+- All CLI commands support `--help` for detailed usage
diff --git a/pyproject.toml b/pyproject.toml
index 2c9593f..d3f863f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,8 @@ dependencies = [
     "requests",
     "accelerate",
     "torchvision",
-    "typer"
+    "typer",
+    "pyarrow>=14.0.0,<23.0.0"
 ]
 
 [project.scripts]

From dba6663ddf606b7e1c39f180c424a7bdb1c4f97f Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Tue, 25 Nov 2025 10:06:45 +0100
Subject: [PATCH 20/24] remove pyarrow

---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d3f863f..2c9593f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,8 +27,7 @@ dependencies = [
     "requests",
     "accelerate",
     "torchvision",
-    "typer",
-    "pyarrow>=14.0.0,<23.0.0"
+    "typer"
 ]
 
 [project.scripts]

From dd6539c44a16bff307f82fe79d013dad2f497392 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Tue, 25 Nov 2025 10:30:34 +0100
Subject: [PATCH 21/24] small change in config

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index 594fd24..de4ac7a 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -65,7 +65,7 @@ asynchronous = true
 
 [qwen-3-4B]
 model = 'Qwen3-4B'
-hf_name = 'Qwen/Qwen3-4B-Instruct-2507' # this finetune is non-thinking only
+hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
 endpoint_type = 'sagemaker'
 
 [phi-3-ollama]

From a39bfda41fa50cd79078add42c6373e9126b3694 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Tue, 25 Nov 2025 11:09:36 +0100
Subject: [PATCH 22/24] detect phi/qwen and apply Kai dynaconf fix for linux

---
 src/wraval/actions/completion.py | 41 ++++++++++++++++++++++++++------
 src/wraval/main.py               |  1 +
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/src/wraval/actions/completion.py b/src/wraval/actions/completion.py
index e2699b0..00cbc21 100644
--- a/src/wraval/actions/completion.py
+++ b/src/wraval/actions/completion.py
@@ -15,14 +15,40 @@
 
 
 # Function to extract last assistant response from each entry
-def extract_last_assistant_response(data):
-
-    if r"<\|assistant\|>" in data: # phi
+def extract_last_assistant_response(data, model_name=None):
+    """
+    Extract the assistant's response from model output.
+    
+    Args:
+        data: Raw model output string
+        model_name: Model identifier (e.g., 'Phi-3-5-mini-instruct', 'Qwen3-4B')
+                   If None, attempts to detect format from data
+    
+    Returns:
+        Cleaned assistant response string
+    """
+    # Determine model type from model_name if provided
+    if model_name:
+        model_lower = model_name.lower()
+        is_phi = 'phi' in model_lower
+        is_qwen = 'qwen' in model_lower
+    else:
+        # Fallback to pattern detection if model_name not provided
+        is_phi = r"<\|assistant\|>" in data
+        is_qwen = r"<|im_start|>assistant" in data
+    
+    # Handle Phi models
+    if is_phi:
+        if r"<\|assistant\|>" not in data:
+            return data
         assistant_part = data.split(r"<\|assistant\|>")[-1]
-        response = response.replace(r"<\|end\|>", "").strip()
+        response = assistant_part.replace(r"<\|end\|>", "").strip()
         return response
-        
-    if r"<|im_start|>assistant" in data: # qwen
+    
+    # Handle Qwen models
+    if is_qwen:
+        if r"<|im_start|>assistant" not in data:
+            return data
         assistant_part = data.split(r"<|im_start|>assistant")[-1]
         
         # Remove the thinking part if it exists
@@ -34,6 +60,7 @@ def extract_last_assistant_response(data):
         response = response.replace(r"<|im_end|>", "").strip()
         return response
     
+    # Return data as-is if no known format detected
     return data
 
 def get_bedrock_completion(settings, prompt, system_prompt=None):
@@ -211,7 +238,7 @@ def invoke_sagemaker_endpoint(
         )
         json_output = response["Body"].readlines()
         plain_output = "\n".join(json.loads(json_output[0]))
-        last_assistant = extract_last_assistant_response(plain_output)
+        last_assistant = extract_last_assistant_response(plain_output, model_name=endpoint_name)
         print("Test response:", last_assistant)
         return last_assistant
     except Exception as e:
diff --git a/src/wraval/main.py b/src/wraval/main.py
index 3a81d49..1b84694 100644
--- a/src/wraval/main.py
+++ b/src/wraval/main.py
@@ -54,6 +54,7 @@ def get_settings(
         ],
         env=f"default,{model}",
         environments=True,
+        case_sensitive=False,
     )
 
     if settings.endpoint_type in ("bedrock", "sagemaker"):

From 7653f5c1e079bbe6c273053453f11cc15dc3293a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Tue, 25 Nov 2025 12:13:39 +0100
Subject: [PATCH 23/24] qwen3-4b async

---
 config/settings.toml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/config/settings.toml b/config/settings.toml
index de4ac7a..bb74ebc 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -68,6 +68,13 @@ model = 'Qwen3-4B'
 hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
 endpoint_type = 'sagemaker'
 
+[qwen-3-4B-async]
+model = 'Qwen3-4B'
+hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
+endpoint_type = 'sagemaker'
+thinking = false
+asynchronous = true
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'

From bc0322a257088f2f6795fd86fbb354f680260b62 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict <gbndict@gmail.com>
Date: Tue, 25 Nov 2025 12:17:25 +0100
Subject: [PATCH 24/24] qwen3-4b async typo

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index bb74ebc..c8a42cb 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -69,7 +69,7 @@ hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
 endpoint_type = 'sagemaker'
 
 [qwen-3-4B-async]
-model = 'Qwen3-4B'
+model = 'Qwen3-4B-Async'
 hf_name = 'Qwen/Qwen3-4B-Instruct-2507'
 endpoint_type = 'sagemaker'
 thinking = false