
Commit b0697bf

Merge pull request #106 from Azure-Samples/update-package
Update to azure-ai-evaluations
2 parents e653bd4 + ad3cbbe commit b0697bf

15 files changed: 114 additions, 110 deletions

.env.sample

Lines changed: 4 additions & 2 deletions
```diff
@@ -2,12 +2,14 @@ OPENAI_HOST="azure"
 OPENAI_GPT_MODEL="gpt-4"
 # For Azure OpenAI only:
 AZURE_OPENAI_EVAL_DEPLOYMENT="<deployment-name>"
-AZURE_OPENAI_SERVICE="<service-name>"
+AZURE_OPENAI_ENDPOINT="https://<service-name>.openai.azure.com"
 AZURE_OPENAI_KEY=""
+AZURE_OPENAI_TENANT_ID=""
 # For openai.com only:
 OPENAICOM_KEY=""
 OPENAICOM_ORGANIZATION=""
 # For generating QA based on search index:
-AZURE_SEARCH_SERVICE="<service-name>"
+AZURE_SEARCH_ENDPOINT="https://<service-name>.search.windows.net"
 AZURE_SEARCH_INDEX="<index-name>"
 AZURE_SEARCH_KEY=""
+AZURE_SEARCH_TENANT_ID=""
```
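
The `.env` now points at full endpoints rather than bare service names and adds optional tenant IDs for keyless auth. A minimal sketch of consuming these settings, assuming the scripts load `.env` with python-dotenv and build the client roughly like this (illustrative only, not the repo's actual `service_setup` code):

```python
# Illustrative sketch: reading the new endpoint-style settings and building an
# Azure OpenAI client, with keyless auth as the default and a key as fallback.
import os

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv  # assumes python-dotenv is installed
from openai import AzureOpenAI

load_dotenv()

endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]  # e.g. https://<service-name>.openai.azure.com
api_key = os.environ.get("AZURE_OPENAI_KEY") or None

if api_key:
    # Key-based access when AZURE_OPENAI_KEY is set in .env.
    client = AzureOpenAI(azure_endpoint=endpoint, api_key=api_key, api_version="2024-06-01")
else:
    # Keyless access via Entra ID (AZURE_OPENAI_TENANT_ID can scope a tenant-aware credential).
    token_provider = get_bearer_token_provider(
        DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
    )
    client = AzureOpenAI(
        azure_endpoint=endpoint,
        azure_ad_token_provider=token_provider,
        api_version="2024-06-01",
    )
```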

.github/workflows/azure-dev.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -28,7 +28,7 @@ jobs:
       AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }}
       # project specific
       OPENAI_HOST: ${{ vars.OPENAI_HOST }}
-      AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }}
+      AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }}
       AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }}
       OPENAI_ORGANIZATION: ${{ vars.OPENAI_ORGANIZATION }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
```

README.md

Lines changed: 5 additions & 12 deletions
````diff
@@ -57,18 +57,11 @@ We've made that easy to deploy with the `azd` CLI tool.
 1. Install the [Azure Developer CLI](https://aka.ms/azure-dev/install)
 2. Run `azd auth login` to log in to your Azure account
 3. Run `azd up` to deploy a new GPT-4 instance
-4. Create a `.env` file based on the provisioned resources by running one of the following commands.
-
-    Bash:
+4. Create a `.env` file based on the provisioned resources by copying `.env.sample` and filling in the required values.
+    You can run this command to see the deployed values:
 
     ```shell
-    azd env get-values > .env
-    ```
-
-    PowerShell:
-
-    ```powershell
-    $output = azd env get-values; Add-Content -Path .env -Value $output;
+    azd env get-values
     ```
 
 ### Using an existing Azure OpenAI instance
@@ -80,7 +73,7 @@ If you already have an Azure OpenAI instance, you can use that instead of creati
 
     ```shell
     AZURE_OPENAI_EVAL_DEPLOYMENT="<deployment-name>"
-    AZURE_OPENAI_SERVICE="<service-name>"
+    AZURE_OPENAI_ENDPOINT="https://<service-name>.openai.azure.com"
     ```
 
 3. The scripts default to keyless access (via `AzureDefaultCredential`), but you can optionally use a key by setting `AZURE_OPENAI_KEY` in `.env`.
@@ -129,7 +122,7 @@ This repo includes a script for generating questions and answers from documents
 2. Fill in the values for your Azure AI Search instance:
 
     ```shell
-    AZURE_SEARCH_SERVICE="<service-name>"
+    AZURE_SEARCH_ENDPOINT="https://<service-name>.search.windows.net"
     AZURE_SEARCH_INDEX="<index-name>"
     AZURE_SEARCH_KEY=""
    ```
````
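
The last hunk switches the QA-generation settings to an explicit search endpoint. A hedged sketch of connecting to that index with the new variable (placeholder values; the repo's generator script may wire this up differently):

```python
# Illustrative sketch: connecting to the Azure AI Search index named in .env,
# using the new AZURE_SEARCH_ENDPOINT variable. Not the repo's actual script.
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient

endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]  # https://<service-name>.search.windows.net
index_name = os.environ["AZURE_SEARCH_INDEX"]
key = os.environ.get("AZURE_SEARCH_KEY")

# Use the key if one is provided, otherwise fall back to keyless (Entra ID) auth.
credential = AzureKeyCredential(key) if key else DefaultAzureCredential()
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)

# Pull a few documents to confirm the index is reachable.
for doc in search_client.search(search_text="*", top=3):
    print(doc)
```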

azure.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@ metadata:
 pipeline:
   variables:
     - OPENAI_HOST
-    - AZURE_OPENAI_SERVICE
+    - AZURE_OPENAI_ENDPOINT
     - AZURE_OPENAI_RESOURCE_GROUP
     - OPENAI_ORGANIZATION
   secrets:
```

pyproject.toml

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,12 +1,12 @@
 [tool.ruff]
 line-length = 120
-target-version = "py311"
+target-version = "py39"
 lint.select = ["E", "F", "I", "UP"]
 lint.ignore = ["D203"]
 
 [tool.black]
 line-length = 120
-target-version = ["py311"]
+target-version = ["py39"]
 
 [tool.pytest.ini_options]
 addopts = "-ra"
```

scripts/evaluate.py

Lines changed: 9 additions & 13 deletions
```diff
@@ -114,16 +114,12 @@ def run_evaluation(
         return False
 
     logger.info("Sending a test chat completion to the GPT deployment to ensure it is running...")
-    try:
-        gpt_response = service_setup.get_openai_client(openai_config).chat.completions.create(
-            model=openai_config.model,
-            messages=[{"role": "user", "content": "Hello!"}],
-            n=1,
-        )
-        logger.info('Successfully received response from GPT: "%s"', gpt_response.choices[0].message.content)
-    except Exception as e:
-        logger.error("Failed to send a test chat completion to the GPT deployment due to error: \n%s", e)
-        return False
+    gpt_response = service_setup.get_openai_client(openai_config).chat.completions.create(
+        model=openai_config["model"],
+        messages=[{"role": "user", "content": "Hello!"}],
+        n=1,
+    )
+    logger.info('Successfully received response from GPT: "%s"', gpt_response.choices[0].message.content)
 
     logger.info("Starting evaluation...")
     for metric in requested_metrics:
@@ -149,8 +145,8 @@ def evaluate_row(row):
         output.update(target_response)
         for metric in requested_metrics:
             result = metric.evaluator_fn(openai_config=openai_config)(
-                question=row["question"],
-                answer=output["answer"],
+                query=row["question"],
+                response=output["answer"],
                 context=output["context"],
                 ground_truth=row["truth"],
             )
@@ -183,7 +179,7 @@ def evaluate_row(row):
 
     with open(results_dir / "evaluate_parameters.json", "w", encoding="utf-8") as parameters_file:
         parameters = {
-            "evaluation_gpt_model": openai_config.model,
+            "evaluation_gpt_model": openai_config["model"],
             "evaluation_timestamp": int(time.time()),
             "testdata_path": str(testdata_path),
             "target_url": target_url,
```

scripts/evaluate_metrics/builtin_metrics.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
-from promptflow.evals.evaluators import (
+from azure.ai.evaluation import (
     CoherenceEvaluator,
     F1ScoreEvaluator,
     FluencyEvaluator,
```
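
With the import moved from `promptflow.evals.evaluators` to `azure.ai.evaluation`, the built-in evaluators are constructed from a model configuration and called with `query`/`response`. A hedged sketch (placeholder endpoint, deployment, and key; exact result keys can vary by package version):

```python
# Hedged sketch of calling a built-in evaluator from azure.ai.evaluation.
from azure.ai.evaluation import CoherenceEvaluator

# Azure OpenAI configuration for the judge model (placeholder values).
model_config = {
    "azure_endpoint": "https://<service-name>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_key": "<key>",
}

coherence = CoherenceEvaluator(model_config)

# The query/response keywords match the rename in evaluate.py above.
result = coherence(
    query="What are the main goals of the Perseverance Mars rover mission?",
    response="To search for signs of ancient life and collect samples for possible return to Earth.",
)
print(result)  # e.g. {"coherence": 5.0, ...}
```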

scripts/evaluate_metrics/code_metrics.py

Lines changed: 14 additions & 14 deletions
```diff
@@ -12,11 +12,11 @@ class AnswerLengthMetric(BaseMetric):
 
     @classmethod
     def evaluator_fn(cls, **kwargs):
-        def answer_length(*, answer, **kwargs):
-            if answer is None:
-                logger.warning("Received answer of None, can't compute answer_length metric. Setting to -1.")
+        def answer_length(*, response, **kwargs):
+            if response is None:
+                logger.warning("Received response of None, can't compute answer_length metric. Setting to -1.")
                 return {cls.METRIC_NAME: -1}
-            return {cls.METRIC_NAME: len(answer)}
+            return {cls.METRIC_NAME: len(response)}
 
         return answer_length
 
@@ -37,11 +37,11 @@ class HasCitationMetric(BaseMetric):
 
     @classmethod
     def evaluator_fn(cls, **kwargs):
-        def has_citation(*, answer, **kwargs):
-            if answer is None:
-                logger.warning("Received answer of None, can't compute has_citation metric. Setting to -1.")
+        def has_citation(*, response, **kwargs):
+            if response is None:
+                logger.warning("Received response of None, can't compute has_citation metric. Setting to -1.")
                 return {cls.METRIC_NAME: -1}
-            return {cls.METRIC_NAME: bool(re.search(r"\[[^\]]+\]", answer))}
+            return {cls.METRIC_NAME: bool(re.search(r"\[[^\]]+\]", response))}
 
         return has_citation
 
@@ -60,14 +60,14 @@ class CitationMatchMetric(BaseMetric):
 
     @classmethod
     def evaluator_fn(cls, **kwargs):
-        def citation_match(*, answer, ground_truth, **kwargs):
-            if answer is None:
-                logger.warning("Received answer of None, can't compute citation_match metric. Setting to -1.")
+        def citation_match(*, response, ground_truth, **kwargs):
+            if response is None:
+                logger.warning("Received response of None, can't compute citation_match metric. Setting to -1.")
                 return {cls.METRIC_NAME: -1}
-            # Return true if all citations in the truth are present in the answer
+            # Return true if all citations in the truth are present in the response
             truth_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", ground_truth))
-            answer_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", answer))
-            citation_match = truth_citations.issubset(answer_citations)
+            response_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", response))
+            citation_match = truth_citations.issubset(response_citations)
             return {cls.METRIC_NAME: citation_match}
 
         return citation_match
```
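
Custom code metrics now receive the model output under the `response` keyword. A hypothetical metric in the same shape, shown standalone without the repo's `BaseMetric` base class (the `word_count` name and logic are invented for illustration):

```python
# Hypothetical custom metric following the new response-keyword contract.
import logging

logger = logging.getLogger(__name__)

METRIC_NAME = "word_count"


def evaluator_fn(**kwargs):
    def word_count(*, response, **kwargs):
        if response is None:
            logger.warning("Received response of None, can't compute word_count metric. Setting to -1.")
            return {METRIC_NAME: -1}
        return {METRIC_NAME: len(response.split())}

    return word_count


if __name__ == "__main__":
    evaluator = evaluator_fn()
    print(evaluator(response="The rover collects rock and soil samples."))  # {'word_count': 7}
```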

scripts/evaluate_metrics/prompts/dontknowness.prompty

Lines changed: 3 additions & 3 deletions
```diff
@@ -27,7 +27,7 @@ sample:
   answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth.
 ---
 system:
-You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
 
 user:
 The "I don't know"-ness metric is a measure of how much an answer conveys the lack of knowledge or uncertainty, which is useful for making sure a chatbot for a particular domain doesn't answer outside that domain. Score the I-dont-know-ness of the answer between one to five stars using the following rating scale:
@@ -59,6 +59,6 @@ question: Where were The Beatles formed?
 answer: I'm sorry, I don't know, that answer is not in my sources.
 stars: 5
 
-question: {{question}}
-answer: {{answer}}
+question: {{query}}
+answer: {{response}}
 stars:
```

scripts/evaluate_metrics/prompts/mycoherence.prompty

Lines changed: 7 additions & 5 deletions
```diff
@@ -6,7 +6,6 @@ model:
   configuration:
     type: azure_openai
     azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
@@ -18,11 +17,14 @@ model:
     type: text
 
 inputs:
-  question:
+  query:
     type: string
-  answer:
+  response:
     type: string
 
+sample:
+  query: What are the main goals of Perseverance Mars rover mission?
+  response: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth.
 ---
 system:
 You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
@@ -57,6 +59,6 @@ question: What can you tell me about climate change and its effects on the envir
 answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
 stars: 5
 
-question: {{question}}
-answer: {{answer}}
+question: {{query}}
+answer: {{response}}
 stars:
```
