Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions coolprompt/assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from coolprompt.task_detector.detector import TaskDetector
from coolprompt.data_generator.generator import SyntheticDataGenerator
from coolprompt.language_model.llm import DefaultLLM
from coolprompt.optimizer.hype import hype_optimizer
from coolprompt.optimizer.hype import HyPEOptimizer, HyPEROptimizer
from coolprompt.optimizer.reflective_prompt import reflectiveprompt
from coolprompt.optimizer.distill_prompt.run import distillprompt
from coolprompt.utils.logging_config import logger, set_verbose, setup_logging
Expand All @@ -23,10 +23,6 @@
CLASSIFICATION_TASK_TEMPLATE,
GENERATION_TASK_TEMPLATE,
)
from coolprompt.utils.prompt_templates.hype_templates import (
CLASSIFICATION_TASK_TEMPLATE_HYPE,
GENERATION_TASK_TEMPLATE_HYPE,
)
from coolprompt.utils.correction.corrector import correct
from coolprompt.utils.correction.rule import LanguageRule
from coolprompt.prompt_assistant.prompt_assistant import PromptAssistant
Expand All @@ -36,12 +32,8 @@ class PromptTuner:
"""Prompt optimization tool supporting multiple methods."""

TEMPLATE_MAP = {
(Task.CLASSIFICATION, Method.HYPE): CLASSIFICATION_TASK_TEMPLATE_HYPE,
(Task.CLASSIFICATION, Method.REFLECTIVE): CLASSIFICATION_TASK_TEMPLATE,
(Task.CLASSIFICATION, Method.DISTILL): CLASSIFICATION_TASK_TEMPLATE,
(Task.GENERATION, Method.HYPE): GENERATION_TASK_TEMPLATE_HYPE,
(Task.GENERATION, Method.REFLECTIVE): GENERATION_TASK_TEMPLATE,
(Task.GENERATION, Method.DISTILL): GENERATION_TASK_TEMPLATE,
Task.CLASSIFICATION: CLASSIFICATION_TASK_TEMPLATE,
Task.GENERATION: GENERATION_TASK_TEMPLATE,
}

def __init__(
Expand Down Expand Up @@ -102,7 +94,7 @@ def get_task_prompt_template(self, task: str, method: str) -> str:
The type of task, either "classification" or "generation".
method (str):
Optimization method to use.
Available methods are: ['hype', 'reflective', 'distill']
Available methods are: ['hype', 'reflective', 'distill', 'hyper']

Returns:
str: The prompt template for the given task.
Expand All @@ -113,7 +105,7 @@ def get_task_prompt_template(self, task: str, method: str) -> str:
)
task = validate_task(task)
method = validate_method(method)
return self.TEMPLATE_MAP[(task, method)]
return self.TEMPLATE_MAP[task]

def _get_dataset_split(
self,
Expand Down Expand Up @@ -182,7 +174,7 @@ def run(
target (Iterable):
Target iterable object for autoprompting optimization.
method (str): Optimization method to use.
Available methods are: ['hype', 'reflective', 'distill']
Available methods are: ['hype', 'reflective', 'distill', 'hyper']
Defaults to hype.
metric (str): Metric to use for optimization.
problem_description (str): a string that contains
Expand Down Expand Up @@ -297,7 +289,7 @@ def run(
prompt=start_prompt,
task=task,
problem_description=problem_description,
num_samples=generate_num_samples
num_samples=generate_num_samples,
)
self.synthetic_dataset = dataset
self.synthetic_target = target
Expand Down Expand Up @@ -329,10 +321,21 @@ def run(
logger.debug(f"Additional kwargs: {kwargs}")

if method is Method.HYPE:
final_prompt = hype_optimizer(
hype_opt = HyPEOptimizer(model=self._target_model)
final_prompt = hype_opt.optimize(
prompt=start_prompt,
meta_info={"task_description": problem_description},
)
elif method is Method.HYPER:
hyper_opt = HyPEROptimizer(
model=self._target_model,
evaluator=evaluator,
**kwargs,
)
final_prompt = hyper_opt.optimize(
prompt=start_prompt,
problem_description=problem_description,
dataset_split=dataset_split,
meta_info={"task_description": problem_description},
)
elif method is Method.REFLECTIVE:
final_prompt = reflectiveprompt(
Expand Down Expand Up @@ -360,7 +363,7 @@ def run(
)

logger.debug(f"Final prompt:\n{final_prompt}")
template = self.TEMPLATE_MAP[(task, method)]
template = self.TEMPLATE_MAP[task]
logger.info(f"Evaluating on given dataset for {task} task...")
self.init_metric = evaluator.evaluate(
prompt=start_prompt,
Expand Down
6 changes: 2 additions & 4 deletions coolprompt/data_generator/generator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import json
from typing import Optional, List, Tuple, Any

import dirtyjson
from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages.ai import AIMessage
Expand Down Expand Up @@ -52,11 +50,11 @@ def _generate(
Returns:
Any: generated data
"""
if hasattr(self.model, 'model'):
if hasattr(self.model, "model"):
wrapped_model = self.model.model
else:
wrapped_model = self.model

if not isinstance(wrapped_model, BaseChatModel):
output = self.model.invoke(request)
if isinstance(output, AIMessage):
Expand Down
107 changes: 73 additions & 34 deletions coolprompt/evaluator/evaluator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import random
from langchain_core.language_models.base import BaseLanguageModel
from typing import Optional
from dataclasses import dataclass
from typing import List, Optional

from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.messages.ai import AIMessage
from coolprompt.evaluator.metrics import BaseMetric
from coolprompt.utils.logging_config import logger
Expand All @@ -12,6 +12,22 @@
)


@dataclass
class FailedExampleDetailed:
instance: str
assistant_answer: str
model_answer_parsed: Optional[str] = None
metric_value: float | int = 0.0
ground_truth: str | int = ""


@dataclass
class EvalResultDetailed:
aggregate_score: float
score_per_task: List[float | int] = None
failed_examples: List[FailedExampleDetailed] = None


class Evaluator:
"""Evaluator class to perform model evaluation using a specified metric.

Expand All @@ -35,29 +51,17 @@ def evaluate(
targets: list[str | int],
template: Optional[str] = None,
) -> float:
"""
Evaluate the model on a dataset
by generating answers and computing the metric.

For each sample in the dataset,
the prompt is concatenated with the sample,
passed to the model to generate an output,
and then all outputs are evaluated
against the targets using the metric.
"""Evaluate the model on a dataset.

Args:
prompt (str): The prompt string to prepend to each dataset sample.
dataset (list[str]): List of input samples to evaluate.
targets (list[str|int]):
Corresponding ground truth labels or references.
template (Optional[str]):
Prompt template for defined task type.
If None, uses default template.
targets (list[str|int]): Corresponding ground truth labels.
template (Optional[str]): Prompt template for defined task type.

Returns:
float: The computed evaluation metric score.
"""

if template is None:
template = self._get_default_template()

Expand All @@ -80,28 +84,64 @@ def evaluate(

return self.metric.compute(answers, targets, dataset)

def _get_full_prompt(
def evaluate_detailed(
    self,
    prompt: str,
    dataset: list[str],
    targets: list[str | int],
    template: Optional[str] = None,
) -> EvalResultDetailed:
    """Evaluate the model and return detailed per-sample results.

    For each sample, the prompt is inserted into the task template,
    passed to the model, and scored against the corresponding target.
    Samples whose per-sample score is zero are collected with their raw
    and parsed model outputs for later inspection.

    Args:
        prompt (str): The prompt string to prepend to each dataset sample.
        dataset (list[str]): List of input samples to evaluate.
        targets (list[str | int]): Corresponding ground truth labels.
        template (Optional[str]): Prompt template for the task type.
            If None, uses the default template.

    Returns:
        EvalResultDetailed: aggregate score, per-sample scores, and the
            details of every sample that scored zero.
    """
    if template is None:
        template = self._get_default_template()

    logger.info(
        f"Evaluating (detailed) prompt for {self.task} task "
        f"on {len(dataset)} samples"
    )
    # Classification metrics need the label set extracted up front.
    if self.task == Task.CLASSIFICATION:
        self.metric.extract_labels(targets)

    answers = self.model.batch(
        [
            self._get_full_prompt(prompt, sample, template)
            for sample in dataset
        ]
    )
    # Chat models return AIMessage objects; unwrap to plain strings.
    answers = [
        a.content if isinstance(a, AIMessage) else a for a in answers
    ]

    parsed_answers = [self.metric.parse_output(a) for a in answers]
    aggregate_score, score_per_task = self.metric.compute_detailed(
        answers, targets
    )

    # Collect full details for every sample the metric scored as zero.
    failed_examples = [
        FailedExampleDetailed(
            instance=dataset[i],
            assistant_answer=answers[i],
            model_answer_parsed=parsed_answers[i],
            metric_value=score,
            ground_truth=targets[i],
        )
        for i, score in enumerate(score_per_task)
        if score == 0
    ]

    return EvalResultDetailed(
        aggregate_score=aggregate_score,
        score_per_task=score_per_task,
        failed_examples=failed_examples,
    )

def _get_full_prompt(
self,
prompt: str,
sample: str,
template: Optional[str] = None,
) -> str:
"""Inserts parts of the prompt into the task template."""
if template is None:
template = self._get_default_template()

Expand All @@ -116,7 +156,6 @@ def _get_full_prompt(

def _get_default_template(self) -> str:
"""Returns the default template for the task type."""

match self.task:
case Task.CLASSIFICATION:
return CLASSIFICATION_TASK_TEMPLATE
Expand Down
Loading