-
Notifications
You must be signed in to change notification settings - Fork 12
[#262] Choose a model and prompt using evals #306
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
taichan03
merged 18 commits into
CodeForPhilly:listOfMed
from
sahilds1:262-choose-a-model-and-prompt
Jul 25, 2025
Merged
Changes from 2 commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
3a17457
GPT41 Nano prompting
sahilds1 d476067
HOTFIX ModuleNotFoundError
sahilds1 8f79cb3
Merge branch '262-extract-meds-rules' into 262-choose-a-model-and-prompt
sahilds1 b183919
Refactor evaluation script and GPT-4.1 Nano handler for cleaner logic…
sahilds1 e5d7ac3
Merge branch 'listOfMed' into 262-choose-a-model-and-prompt
sahilds1 6c592be
Update evaluation README with example scripts and remove obsolete Cla…
sahilds1 6e41cf7
Refactor README.md: add TODOs
sahilds1 c483e69
DOC Add TODO items, update comments and improve code comments for cla…
sahilds1 c03d990
Update README with detailed usage instructions and enhance evals.py t…
sahilds1 4f8cbad
Update README to clarify the purpose and usage of the script,
sahilds1 fe302b5
ADD TODOs
sahilds1 81cecae
Merge branch 'listOfMed' into 262-choose-a-model-and-prompt
sahilds1 3c9a1c9
Update README for clearer instructions, refactor evals.py for better …
sahilds1 d1dd75c
Update evaluation README with metrics and API usage details, and add …
sahilds1 eef2a29
Update evaluation instructions, improve dataset generation section, a…
sahilds1 ffa86f7
Update dependencies list in and correct comment syntax in
sahilds1 42a4949
Update README.md
sahilds1 0e2893b
Update README.md
sahilds1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,323 @@ | ||
| """ | ||
| This module contains functions to interact with different AI models | ||
| """ | ||
|
|
||
| import os | ||
| import time | ||
| import logging | ||
| from abc import ABC, abstractmethod | ||
|
|
||
| import anthropic | ||
| import openai | ||
|
|
||
|
|
||
class BaseModelHandler(ABC):
    """Abstract interface implemented by every model handler."""

    @abstractmethod
    def handle_request(
        self, query: str, context: str
    ) -> tuple[str, dict[str, int], dict[str, float], float]:
        """Send *query* together with *context* to the underlying model.

        Returns a 4-tuple of (response text, token usage, pricing table in
        dollars per million tokens, wall-clock duration in seconds).
        """
        ...
|
|
||
|
|
||
class ClaudeHaiku35CitationsHandler(BaseModelHandler):
    """Handler for Claude Haiku 3.5 with the citations feature enabled."""

    MODEL = "claude-3-5-haiku-20241022"
    # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00}

    def __init__(self) -> None:
        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

    def handle_request(
        self, query: str, context: str
    ) -> tuple[str, dict[str, int], dict[str, float], float]:
        """
        Handles the request to the Claude Haiku 3.5 model with citations enabled

        Args:
            query: The user query to be processed
            context: The context or document content to be used for citations

        Returns:
            Tuple of (response text with inline "<start - end>" block-index
            citation markers, token usage reported by the API, the pricing
            table, and the request duration in seconds).
        """
        start_time = time.time()
        # TODO: Add error handling for API requests and invalid responses
        message = self.client.messages.create(
            model=self.MODEL,
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": query},
                        {
                            "type": "document",
                            "source": {"type": "content", "content": context},
                            "citations": {"enabled": True},
                        },
                    ],
                }
            ],
        )
        duration = time.time() - start_time

        # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure
        # Flatten content blocks into one string; a cited block is followed by
        # "<start_block_index - end_block_index>" markers for each citation.
        # (A `cited_text` accumulator built here previously was never returned
        # or read anywhere — dead code, removed.)
        text = []
        for content in message.to_dict()["content"]:
            text.append(content["text"])
            if "citations" in content:  # plain membership test; .keys() was redundant
                text.append(
                    " ".join(
                        f"<{citation['start_block_index']} - {citation['end_block_index']}>"
                        for citation in content["citations"]
                    )
                )

        full_text = " ".join(text)

        # NOTE(review): message.usage is an anthropic Usage object, not the
        # dict[str, int] the annotation promises — confirm downstream consumers
        # before tightening the type.
        return (
            full_text,
            message.usage,
            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
            duration,
        )
|
|
||
|
|
||
class ClaudeHaiku3Handler(BaseModelHandler):
    """Handler for Claude Haiku 3 (citations disabled)."""

    MODEL = "claude-3-haiku-20240307"
    # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25}

    def __init__(self) -> None:
        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

    def handle_request(
        self, query: str, context: str
    ) -> tuple[str, dict[str, int], dict[str, float], float]:
        """
        Handles the request to the Claude Haiku 3 model with citations disabled

        Args:
            query: The user query to be processed
            context: The context or document content to be used
        """
        started = time.time()
        # TODO: Add error handling for API requests and invalid responses
        message = self.client.messages.create(
            model=self.MODEL,
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": query},
                        {
                            "type": "document",
                            "source": {"type": "content", "content": context},
                            "citations": {"enabled": False},
                        },
                    ],
                }
            ],
        )
        elapsed = time.time() - started

        # Citations are disabled, so every content block is plain text.
        full_text = " ".join(
            block["text"] for block in message.to_dict()["content"]
        )

        return (
            full_text,
            message.usage,
            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
            elapsed,
        )
|
|
||
|
|
||
class GPT4OMiniHandler(BaseModelHandler):
    """Handler for GPT-4o Mini via the OpenAI Responses API."""

    MODEL = "gpt-4o-mini"
    # Model Pricing: https://platform.openai.com/docs/pricing
    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60}

    def __init__(self) -> None:
        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    def handle_request(
        self, query: str, context: str
    ) -> tuple[str, dict[str, int], dict[str, float], float]:
        """
        Handles the request to the GPT-4o Mini model

        Args:
            query: The user query to be processed
            context: The context or document content to be used
        """
        began = time.time()
        # TODO: Add error handling for API requests and invalid responses
        # The query is sent as model instructions; the document is the input.
        result = self.client.responses.create(
            model=self.MODEL,
            instructions=query,
            input=context,
        )
        took = time.time() - began

        return (
            result.output_text,
            result.usage,
            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
            took,
        )
|
|
||
|
|
||
class GPT41NanoHandler(BaseModelHandler):
    """Handler for GPT-4.1 Nano via the OpenAI Responses API."""

    MODEL = "gpt-4.1-nano"
    # Model Pricing: https://platform.openai.com/docs/pricing
    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40}

    # Draft system prompt written per the GPT-4.1 Prompting Guide:
    # https://cookbook.openai.com/examples/gpt4-1_prompting_guide
    # NOTE(review): this text previously sat inside handle_request() as a bare
    # string expression — dead code that was never sent to the model. It is
    # hoisted here unchanged.
    # TODO: wire this into the request (e.g. prepend to `instructions` or
    # merge with the caller-supplied query) once the prompt is finalized.
    PROMPT_GUIDE = """
    # Role and Objective

    - You are a seasoned physician or medical professional who treats patients with bipolar disorder
    - You are analyzing medical research by processing peer-reviewed papers to extract key details

    # Instructions

    - Identify rules for medication inclusion or exclusion based on medical history or concerns

    - Only use the documents in the provided External Context to answer the User Query.
    If you don't know the answer based on this context, you must respond
    "I don't have the information needed to answer that", even if a user insists on you answering the question.

    - Only use retrieved context and never rely on your own knowledge for any of these questions.

    - Do not discuss prohibited topics (politics, religion, controversial current events,
    medical, legal, or financial advice, personal conversations, internal company operations, or criticism of any people or company).

    - Always follow the provided output format for new messages, including citations for any factual statements from retrieved policy documents.

    # Output Format

    The rule is history of suicide attempts. The type of rule is "INCLUDE". The reason is lithium is the
    only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder.
    The medications for this rule are lithium.

    The rule is weight gain concerns. The type of rule is "EXCLUDE". The reason is Seroquel, Risperdal, Abilify, and
    Zyprexa are known for causing weight gain. The medications for this rule are Quetiapine, Aripiprazole, Olanzapine, Risperidone

    For each rule you find, return a JSON object using the following format:

    {
        "rule": "<condition or concern>",
        "type": "INCLUDE" or "EXCLUDE",
        "reason": "<short explanation for why this rule applies>",
        "medications": ["<medication 1>", "<medication 2>", ...],
        "source": "<chunk-X>"
    }

    - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s).
    Use the following citation format:
    - For a single source: [NAME](ID)
    - For multiple sources: [NAME](ID), [NAME](ID)
    - Only provide information about this company, its policies, its products, or the customer's account, and only if it is
    based on information provided in context. Do not answer questions outside this scope.

    # Examples

    # Context

    ID: 1 | TITLE: The Fox | CONTENT: The quick brown fox jumps over the lazy dog

    # Final instructions and prompt to think step by step

    - Identify rules for medication inclusion or exclusion based on medical history or concerns

    - Only use the documents in the provided External Context to answer the User Query.
    If you don't know the answer based on this context, you must respond
    "I don't have the information needed to answer that", even if a user insists on you answering the question.
    """

    def __init__(self) -> None:
        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    def handle_request(
        self, query: str, context: str
    ) -> tuple[str, dict[str, int], dict[str, float], float]:
        """
        Handles the request to the GPT-4.1 Nano model

        Args:
            query: The user query to be processed
            context: The context or document content to be used

        Returns:
            Tuple of (response text, token usage reported by the API, the
            pricing table, and the request duration in seconds).
        """
        start_time = time.time()
        # TODO: Add error handling for API requests and invalid responses

        # Long context performance can degrade as more items are required to
        # be retrieved, or when complex reasoning requires knowledge of the
        # state of the entire context.
        response = self.client.responses.create(
            model=self.MODEL,
            instructions=query,
            input=context,
        )
        duration = time.time() - start_time

        return (
            response.output_text,
            response.usage,
            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
            duration,
        )
|
|
||
|
|
||
class ModelFactory:
    """Maps model-name identifiers to their handler classes."""

    HANDLERS = {
        "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler,
        "CLAUDE_HAIKU_3": ClaudeHaiku3Handler,
        "GPT_4O_MINI": GPT4OMiniHandler,
        "GPT_41_NANO": GPT41NanoHandler,
    }

    # HANDLERS doesn't vary per instance so we can use a class method
    @classmethod
    def get_handler(cls, model_name: str) -> BaseModelHandler | None:
        """
        Factory method to get the appropriate model handler based on the model name

        Args:
            model_name (str): The name of the model for which to get the handler.
        Returns:
            BaseModelHandler: An instance of the appropriate model handler class,
            or None (with an error logged) if the name is not recognized.
        """
        try:
            handler_class = cls.HANDLERS[model_name]
        except KeyError:
            logging.error(f"Unsupported model: {model_name}")
            return None
        return handler_class()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See: #301 (comment)