1 change: 1 addition & 0 deletions .gitignore
@@ -164,3 +164,4 @@ cython_debug/
# VSCode
.vscode
.azure
test_output/
38 changes: 37 additions & 1 deletion notebooks/.env.sample
@@ -1 +1,37 @@
AZURE_AI_ENDPOINT=
# Azure Content Understanding Service Configuration
# Copy this file to <repository-root>/.env and update with your actual values

# Your Azure Content Understanding service endpoint
# Example: https://your-resource-name.services.ai.azure.com/
# If you need help creating one, see the Prerequisites section in:
# https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=document#prerequisites
# As of May 2025, the 2025-05-01-preview API version is available only in the regions documented in
# Content Understanding region and language support (https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/language-region-support).

# Azure Content Understanding Test Configuration

# Required for Content Understanding SDK and testing
AZURE_CONTENT_UNDERSTANDING_ENDPOINT=https://your-resource-name.services.ai.azure.com/

# Authentication Options:
# Option 1: Use Azure Key (FOR TESTING ONLY - Less secure)
# Set this value if you want to use key-based authentication
# WARNING: Keys are less secure and should only be used for testing/development
# Leave empty to use DefaultAzureCredential (recommended)
AZURE_CONTENT_UNDERSTANDING_KEY=

# Option 2: Use DefaultAzureCredential (RECOMMENDED for production and development)
# If AZURE_CONTENT_UNDERSTANDING_KEY is empty, the script will use DefaultAzureCredential
#
# Most common development scenario:
# 1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
# 2. Login: az login
# 3. Run the script (no additional configuration needed)
#
# This also supports:
# - Environment variables (AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID)
# - Managed Identity (for Azure-hosted applications)
# - Visual Studio Code authentication
# - Azure PowerShell authentication
# For more info: https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme#defaultazurecredential

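For reference, the notebooks resolve these settings exactly as the comments above describe — a minimal sketch, assuming `python-dotenv`, `azure-identity`, and the preview `azure-ai-contentunderstanding` package are installed:

```python
import os

from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.ai.contentunderstanding.aio import ContentUnderstandingClient

load_dotenv()  # reads .env from the current directory

endpoint = os.environ["AZURE_CONTENT_UNDERSTANDING_ENDPOINT"]

# Key-based auth is for testing only; leaving the key empty falls back to
# DefaultAzureCredential (az login, managed identity, VS Code, etc.)
key = os.getenv("AZURE_CONTENT_UNDERSTANDING_KEY")
credential = AzureKeyCredential(key) if key else DefaultAzureCredential()

client = ContentUnderstandingClient(endpoint=endpoint, credential=credential)
```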
236 changes: 175 additions & 61 deletions notebooks/analyzer_training.ipynb
@@ -57,7 +57,6 @@
"metadata": {},
"outputs": [],
"source": [
"analyzer_template = \"../analyzer_templates/receipt.json\"\n",
"training_docs_folder = \"../data/document_training\""
]
},
@@ -88,30 +87,45 @@
"import json\n",
"import os\n",
"import sys\n",
"from pathlib import Path\n",
"from dotenv import find_dotenv, load_dotenv\n",
"from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
"from datetime import datetime\n",
"import uuid\n",
"from dotenv import load_dotenv\n",
"from azure.storage.blob import ContainerSasPermissions\n",
"from azure.core.credentials import AzureKeyCredential\n",
"from azure.identity import DefaultAzureCredential\n",
"from azure.ai.contentunderstanding.aio import ContentUnderstandingClient\n",
"from azure.ai.contentunderstanding.models import (\n",
" ContentAnalyzer,\n",
" FieldSchema,\n",
" FieldDefinition,\n",
" FieldType,\n",
" GenerationMethod,\n",
" AnalysisMode,\n",
" ProcessingLocation,\n",
")\n",
"\n",
"# Import utility package from the Python samples root directory\n",
"parent_dir = Path(Path.cwd()).parent\n",
"sys.path.append(str(parent_dir))\n",
"from python.content_understanding_client import AzureContentUnderstandingClient\n",
"# Add the parent directory to the Python path to import the sample_helper module\n",
"sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'python'))\n",
"from extension.document_processor import DocumentProcessor\n",
"from extension.sample_helper import extract_operation_id_from_poller, PollerType, save_json_to_file\n",
"\n",
"load_dotenv(find_dotenv())\n",
"load_dotenv()\n",
"logging.basicConfig(level=logging.INFO)\n",
"\n",
"credential = DefaultAzureCredential()\n",
"token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n",
"\n",
"client = AzureContentUnderstandingClient(\n",
" endpoint=os.getenv(\"AZURE_AI_ENDPOINT\"),\n",
" api_version=os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\"),\n",
" # IMPORTANT: Comment out token_provider if using subscription key\n",
" token_provider=token_provider,\n",
" # IMPORTANT: Uncomment this if using subscription key\n",
" # subscription_key=os.getenv(\"AZURE_AI_API_KEY\"),\n",
" x_ms_useragent=\"azure-ai-content-understanding-python/analyzer_training\", # This header is used for sample usage telemetry; please comment out this line if you want to opt out.\n",
")"
"endpoint = os.environ.get(\"AZURE_CONTENT_UNDERSTANDING_ENDPOINT\")\n",
"# Return AzureKeyCredential if AZURE_CONTENT_UNDERSTANDING_KEY is set, otherwise DefaultAzureCredential\n",
"key = os.getenv(\"AZURE_CONTENT_UNDERSTANDING_KEY\")\n",
"credential = AzureKeyCredential(key) if key else DefaultAzureCredential()\n",
"# Create the ContentUnderstandingClient\n",
"client = ContentUnderstandingClient(endpoint=endpoint, credential=credential)\n",
"print(\"✅ ContentUnderstandingClient created successfully\")\n",
"\n",
"try:\n",
" processor = DocumentProcessor(client)\n",
" print(\"✅ DocumentProcessor created successfully\")\n",
"except Exception as e:\n",
" print(f\"❌ Failed to create DocumentProcessor: {e}\")\n",
" raise"
]
},
{
@@ -133,26 +147,27 @@
"metadata": {},
"outputs": [],
"source": [
"# Load reference storage configuration from environment\n",
"training_data_path = os.getenv(\"TRAINING_DATA_PATH\") or f\"training_data_{uuid.uuid4().hex[:8]}\"\n",
"training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n",
"\n",
"if not training_data_path.endswith(\"/\"):\n",
" training_data_sas_url += \"/\"\n",
"\n",
"if not training_data_sas_url:\n",
" TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n",
" TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n",
" if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not training_data_sas_url:\n",
" raise ValueError(\n",
" \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n",
" training_data_storage_account_name = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n",
" training_data_container_name = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n",
"\n",
" if training_data_storage_account_name and training_data_container_name:\n",
" # We require \"Write\" permission to upload, modify, or append blobs\n",
" training_data_sas_url = processor.generate_container_sas_url(\n",
" account_name=training_data_storage_account_name,\n",
" container_name=training_data_container_name,\n",
" permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
" expiry_hours=1,\n",
" )\n",
" from azure.storage.blob import ContainerSasPermissions\n",
" # Requires \"Write\" (critical for upload/modify/append) along with \"Read\" and \"List\" for viewing/listing blobs.\n",
" training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
" account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
" container_name=TRAINING_DATA_CONTAINER_NAME,\n",
" permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
" expiry_hours=1,\n",
" )\n",
"\n",
"training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n",
"\n",
"await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
"\n",
"await processor.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
]
},
{
@@ -162,7 +177,7 @@
"## Create Analyzer with Defined Schema\n",
"Before creating the analyzer, fill in the constant `ANALYZER_ID` with a relevant name for your task. In this example, we generate a unique suffix so that this cell can be run multiple times to create different analyzers.\n",
"\n",
"We use **training_data_sas_url** and **training_data_path** as set in the [.env](./.env) file and used in the previous step."
"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** as set in the [.env](./.env) file and used in the previous step."
]
},
{
@@ -171,24 +186,78 @@
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"CUSTOM_ANALYZER_ID = \"train-sample-\" + str(uuid.uuid4())\n",
"analyzer_id = f\"analyzer-training-sample-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{uuid.uuid4().hex[:8]}\"\n",
"\n",
"content_analyzer = ContentAnalyzer(\n",
" base_analyzer_id=\"prebuilt-documentAnalyzer\",\n",
" description=\"Extract useful information from receipt\",\n",
" field_schema=FieldSchema(\n",
" name=\"receipt schema\",\n",
" description=\"Schema for receipt\",\n",
" fields={\n",
" \"MerchantName\": FieldDefinition(\n",
" type=FieldType.STRING,\n",
" method=GenerationMethod.EXTRACT,\n",
" description=\"\"\n",
" ),\n",
" \"Items\": FieldDefinition(\n",
" type=FieldType.ARRAY,\n",
" method=GenerationMethod.GENERATE,\n",
" description=\"\",\n",
" items_property={\n",
" \"type\": \"object\",\n",
" \"method\": \"extract\",\n",
" \"properties\": {\n",
" \"Quantity\": {\n",
" \"type\": \"string\",\n",
" \"method\": \"extract\",\n",
" \"description\": \"\"\n",
" },\n",
" \"Name\": {\n",
" \"type\": \"string\",\n",
" \"method\": \"extract\",\n",
" \"description\": \"\"\n",
" },\n",
" \"Price\": {\n",
" \"type\": \"string\",\n",
" \"method\": \"extract\",\n",
" \"description\": \"\"\n",
" }\n",
" }\n",
" }\n",
" ),\n",
" \"TotalPrice\": FieldDefinition(\n",
" type=FieldType.STRING,\n",
" method=GenerationMethod.EXTRACT,\n",
" description=\"\"\n",
" )\n",
" }\n",
" ),\n",
" mode=AnalysisMode.STANDARD,\n",
" processing_location=ProcessingLocation.GEOGRAPHY,\n",
" tags={\"demo_type\": \"get_result\"},\n",
" training_data={\n",
" \"kind\": \"blob\",\n",
" \"containerUrl\": training_data_sas_url,\n",
" \"prefix\": training_data_path\n",
" },\n",
")\n",
"print(f\"🔧 Creating custom analyzer '{analyzer_id}'...\")\n",
"poller = await client.content_analyzers.begin_create_or_replace(\n",
" analyzer_id=analyzer_id,\n",
" resource=content_analyzer,\n",
")\n",
"\n",
"response = client.begin_create_analyzer(\n",
" CUSTOM_ANALYZER_ID,\n",
" analyzer_template_path=analyzer_template,\n",
" training_storage_container_sas_url=training_data_sas_url,\n",
" training_storage_container_path_prefix=training_data_path,\n",
"# Extract operation ID from the poller\n",
"operation_id = extract_operation_id_from_poller(\n",
" poller, PollerType.ANALYZER_CREATION\n",
")\n",
"result = client.poll_result(response)\n",
"if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",
" logging.info(f\"Analyzer details for {result['result']['analyzerId']}\")\n",
" logging.info(json.dumps(result, indent=2))\n",
"else:\n",
" logging.warning(\n",
" \"An issue was encountered when trying to create the analyzer. \"\n",
" \"Please double-check your deployment and configurations for potential problems.\"\n",
" )"
"print(f\"📋 Extracted creation operation ID: {operation_id}\")\n",
"\n",
"# Wait for the analyzer to be created\n",
"print(f\"⏳ Waiting for analyzer creation to complete...\")\n",
"await poller.result()\n",
"print(f\"✅ Analyzer '{analyzer_id}' created successfully!\")"
]
},
{
@@ -205,10 +274,53 @@
"metadata": {},
"outputs": [],
"source": [
"response = client.begin_analyze(CUSTOM_ANALYZER_ID, file_location='../data/receipt.png')\n",
"result_json = client.poll_result(response)\n",
"file_path = \"../data/receipt.png\"\n",
"print(f\"📄 Reading document file: {file_path}\")\n",
"with open(file_path, \"rb\") as f:\n",
" data_content = f.read()\n",
"\n",
"# Begin document analysis operation\n",
"print(f\"🔍 Starting document analysis with analyzer '{analyzer_id}'...\")\n",
"analysis_poller = await client.content_analyzers.begin_analyze_binary(\n",
" analyzer_id=analyzer_id, \n",
" input=data_content,\n",
" content_type=\"application/octet-stream\")\n",
"\n",
"# Wait for analysis completion\n",
"print(f\"⏳ Waiting for document analysis to complete...\")\n",
"analysis_result = await analysis_poller.result()\n",
"print(f\"✅ Document analysis completed successfully!\")\n",
"\n",
" # Extract operation ID for get_result\n",
"analysis_operation_id = extract_operation_id_from_poller(\n",
" analysis_poller, PollerType.ANALYZE_CALL\n",
")\n",
"print(f\"📋 Extracted analysis operation ID: {analysis_operation_id}\")\n",
"\n",
"# Get the analysis result using the operation ID\n",
"print(\n",
" f\"🔍 Getting analysis result using operation ID '{analysis_operation_id}'...\"\n",
")\n",
"operation_status = await client.content_analyzers.get_result(\n",
" operation_id=analysis_operation_id,\n",
")\n",
"\n",
"print(f\"✅ Analysis result retrieved successfully!\")\n",
"print(f\" Operation ID: {operation_status.id}\")\n",
"print(f\" Status: {operation_status.status}\")\n",
"\n",
"logging.info(json.dumps(result_json, indent=2))"
"# The actual analysis result is in operation_status.result\n",
"operation_result = operation_status.result\n",
"if operation_result is None:\n",
" print(\"⚠️ No analysis result available\")\n",
"\n",
"print(f\"📄 Analysis Result: {json.dumps(operation_result.as_dict())}\")\n",
"\n",
"# Save the analysis result to a file\n",
"saved_file_path = save_json_to_file(\n",
" result=operation_result.as_dict(),\n",
" filename_prefix=\"analyzer_training_get_result\",\n",
")"
]
},
{
@@ -225,13 +337,15 @@
"metadata": {},
"outputs": [],
"source": [
"client.delete_analyzer(CUSTOM_ANALYZER_ID)"
"print(f\"🗑️ Deleting analyzer '{analyzer_id}' (demo cleanup)...\")\n",
"await client.content_analyzers.delete(analyzer_id=analyzer_id)\n",
"print(f\"✅ Analyzer '{analyzer_id}' deleted successfully!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "py312",
"language": "python",
"name": "python3"
},
Expand All @@ -245,7 +359,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
"version": "3.12.3"
}
},
"nbformat": 4,