Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/.trigger
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Dummy file to trigger workflow
# Modify the timestamp below to force workflow runs
Last updated: 2024-11-12 00:00:00

260 changes: 237 additions & 23 deletions .github/workflows/container-validation-backends.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- main
- "pull-request/[0-9]+"
- release/*.*.*
workflow_dispatch: # Allow manual triggering for testing

concurrency:
# The group name is a ternary operation. If the ref_name is 'main',
Expand Down Expand Up @@ -43,7 +44,8 @@

operator:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing
if: true # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -119,7 +121,8 @@

vllm:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing
if: true # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -184,7 +187,8 @@

sglang:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing
if: true # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -247,7 +251,8 @@

trtllm:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing
if: true # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -452,7 +457,7 @@
name: Upload Workflow Metrics
runs-on: gitlab
if: always() # Always run, even if other jobs fail
needs: [backend-status-check] # Wait for the status check which waits for all build jobs

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-sglang (agg_router)

Deploy Test Failed: sglang (agg_router)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-sglang (agg)

Deploy Test Failed: sglang (agg)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-vllm (agg_router)

Deploy Test Failed: vllm (agg_router)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-vllm (agg)

Deploy Test Failed: vllm (agg)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-trtllm (agg_router)

Deploy Test Failed: trtllm (agg_router)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-trtllm (agg)

Deploy Test Failed: trtllm (agg)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-trtllm (disagg)

Deploy Test Failed: trtllm (disagg)

Unknown error occurred during deployment test

Check failure on line 460 in .github/workflows/container-validation-backends.yml

View workflow job for this annotation

GitHub Actions / deploy-test-vllm (disagg)

Deploy Test Failed: vllm (disagg)

Unknown error occurred during deployment test

steps:
- name: Check out repository
Expand Down Expand Up @@ -499,7 +504,8 @@

deploy-operator:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing - run even if dependencies fail
if: always() # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, operator, vllm, sglang, trtllm]
env:
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
Expand All @@ -513,6 +519,9 @@
BRANCH: ${{ github.ref_name }}
run: |
set -x

# Redirect all output to a log file while still showing it
exec > >(tee -a deploy-operator.log) 2>&1

# Set namespace
# Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
Expand All @@ -521,6 +530,7 @@
BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
echo "NAMESPACE=${NAMESPACE}" >> $GITHUB_ENV

# Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
Expand Down Expand Up @@ -555,23 +565,144 @@
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
# Create docker pull secret for operator image
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}

# Check if Helm is available
echo "Checking Helm availability..."
if ! command -v helm &> /dev/null; then
ERROR_MSG="Helm is not installed or not available in PATH. Helm installation may have failed during runner setup."
echo "$ERROR_MSG"
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi

echo "Helm version: $(helm version --short)"

# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
echo "Installing Helm dependencies..."
if ! helm repo add bitnami https://charts.bitnami.com/bitnami 2>&1; then
ERROR_MSG="Failed to add Helm bitnami repository"
echo "$ERROR_MSG"
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi

cd deploy/cloud/helm/platform/
helm dep build .

if ! helm dep build . 2>&1; then
ERROR_MSG="Failed to build Helm dependencies"
echo "$ERROR_MSG"
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi

# Install platform with namespace restriction for single profile testing
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
echo "Installing dynamo-platform Helm chart..."
if ! helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret 2>&1; then
ERROR_MSG="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
echo "$ERROR_MSG"

# Capture additional diagnostics
echo "=== Pod Status ==="
kubectl get pods -n ${NAMESPACE} -o wide || true
echo "=== Events ==="
kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' | tail -20 || true
echo "=== Helm Status ==="
helm status dynamo-platform -n ${NAMESPACE} || true

echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi
Comment on lines +599 to +619
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Variable naming inconsistency and potential unsafe output concatenation.

Line 454 uses ERROR_MSG instead of ERROR_MESSAGE (used elsewhere). More critically, diagnostic outputs (pods, events) are concatenated into ERROR_MESSAGE without sanitization before being stored in GITHUB_ENV. If any output contains newlines or special characters, this could corrupt the environment variable.

Standardize the variable name and escape diagnostic output:

         if ! helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
           --set dynamo-operator.namespaceRestriction.enabled=true \
           --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
           --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
           --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
           --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret 2>&1; then
-          ERROR_MSG="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
+          ERROR_MESSAGE="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
           echo "$ERROR_MSG"

For the diagnostic capture, consider logging to a file artifact instead of concatenating into a single environment variable.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
echo "Installing dynamo-platform Helm chart..."
if ! helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret 2>&1; then
ERROR_MSG="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
echo "$ERROR_MSG"
# Capture additional diagnostics
echo "=== Pod Status ==="
kubectl get pods -n ${NAMESPACE} -o wide || true
echo "=== Events ==="
kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' | tail -20 || true
echo "=== Helm Status ==="
helm status dynamo-platform -n ${NAMESPACE} || true
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi
echo "Installing dynamo-platform Helm chart..."
if ! helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret 2>&1; then
ERROR_MESSAGE="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
echo "$ERROR_MESSAGE"
# Capture additional diagnostics
echo "=== Pod Status ==="
kubectl get pods -n ${NAMESPACE} -o wide || true
echo "=== Events ==="
kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' | tail -20 || true
echo "=== Helm Status ==="
helm status dynamo-platform -n ${NAMESPACE} || true
echo "ERROR_MESSAGE=$ERROR_MESSAGE" >> $GITHUB_ENV
exit 1
fi
🤖 Prompt for AI Agents
In .github/workflows/container-validation-backends.yml around lines 447 to 467,
replace the inconsistent ERROR_MSG with ERROR_MESSAGE and stop writing raw
multi-line diagnostic output into GITHUB_ENV; instead capture the
pod/events/helm outputs to separate files (or a single diagnostics file) and
upload them as workflow artifacts (or base64-encode/sanitize before exporting)
while only writing a single-line, safe ERROR_MESSAGE to GITHUB_ENV (e.g.,
"Failed to install dynamo-platform; diagnostics saved to <path>" or the
base64-encoded diagnostics string) to avoid corrupting the environment file.


# Wait for all deployments to be ready
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
echo "Waiting for deployments to be ready..."
if ! timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch 2>&1; then
ERROR_MSG="Deployment rollout timed out after 300 seconds"
echo "$ERROR_MSG"

# Capture diagnostics
echo "=== Pod Status ==="
kubectl get pods -n ${NAMESPACE} -o wide || true
echo "=== Deployment Status ==="
kubectl get deployments -n ${NAMESPACE} || true

echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi
continue-on-error: true

- name: Create GitHub Annotation on Failure
if: steps.deploy-operator-step.outcome == 'failure'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
set -x

# Extract error message from environment
ERROR_MESSAGE="${ERROR_MESSAGE:-Unknown error occurred during operator deployment}"

# Get additional context from Kubernetes if namespace is set
if [ -n "$NAMESPACE" ]; then
export KUBECONFIG=$(pwd)/.kubeconfig

# Get pod status
POD_STATUS=$(kubectl get pods -n $NAMESPACE -o json 2>/dev/null || echo "{}")
POD_ERRORS=$(echo "$POD_STATUS" | jq -r '.items[] | select(.status.phase != "Running") | "Pod: \(.metadata.name), Status: \(.status.phase), Reason: \(.status.containerStatuses[0].state.waiting.reason // .status.reason // "N/A"), Message: \(.status.containerStatuses[0].state.waiting.message // "N/A")"' 2>/dev/null | head -10 || echo "")

if [ -n "$POD_ERRORS" ]; then
ERROR_MESSAGE="$ERROR_MESSAGE\n\nPod Status:\n$POD_ERRORS"
fi

# Get recent events
EVENTS=$(kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' -o json 2>/dev/null || echo "{}")
ERROR_EVENTS=$(echo "$EVENTS" | jq -r '.items[] | select(.type == "Warning" or .type == "Error") | "\(.lastTimestamp) - \(.reason): \(.message)"' 2>/dev/null | tail -5 || echo "")

if [ -n "$ERROR_EVENTS" ]; then
ERROR_MESSAGE="$ERROR_MESSAGE\n\nRecent Events:\n$ERROR_EVENTS"
fi
fi
Comment on lines +653 to +667
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Diagnostic outputs appended to ERROR_MESSAGE without escaping.

Pod and event information are concatenated into ERROR_MESSAGE (lines 505, 513) without escaping newlines or special characters. This exacerbates the JSON escaping issue flagged earlier. When ERROR_MESSAGE is later embedded in the GitHub API payload, these unescaped values will corrupt the JSON.

Consider building a structured diagnostic object separately or properly escaping before concatenation.


# Create a check run with the annotation
CHECK_RUN_ID=$(curl -s -X POST \
-H "Authorization: token $GITHUB_TOKEN" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/check-runs" \
-d '{
"name": "Deploy Operator",
"head_sha": "${{ github.sha }}",
"status": "completed",
"conclusion": "failure",
"output": {
"title": "Operator Deployment Failed",
"summary": "Failed to deploy dynamo-platform operator to namespace '"${NAMESPACE}"'",
"text": "**Job**: deploy-operator\n**Namespace**: '"${NAMESPACE}"'\n\n**Error Details**:\n```\n'"${ERROR_MESSAGE}"'\n```\n\n[View Job Run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})",
"annotations": [{
"path": ".github/workflows/container-validation-backends.yml",
"start_line": 357,
"end_line": 425,
"annotation_level": "failure",
"message": "'"${ERROR_MESSAGE}"'",
"title": "Operator deployment failed"
}]
}
}' | jq -r '.id')
Comment on lines +669 to +692
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical: Unescaped ERROR_MESSAGE in JSON payload; hard-coded line numbers will become stale.

Line 536 interpolates ERROR_MESSAGE directly into JSON without escaping quotes or newlines. If the error message contains " or newline characters, the JSON will be malformed. Additionally, hard-coded line numbers (357, 425 at lines 533-534) are brittle; they must be updated whenever the file structure changes.

Apply proper JSON escaping and use dynamic line references:

          "annotations": [{
            "path": ".github/workflows/container-validation-backends.yml",
-           "start_line": 357,
-           "end_line": 425,
+           "start_line": ${{ steps.deploy-operator-step.step_id_line_start || 357 }},
+           "end_line": ${{ steps.deploy-operator-step.step_id_line_end || 425 }},
            "annotation_level": "failure",
-           "message": "'"${ERROR_MESSAGE}"'",
+           "message": "$(printf '%s\n' "${ERROR_MESSAGE}" | jq -Rs .)",
            "title": "Operator deployment failed"
          }]

Better yet, use a shell utility to properly escape JSON:

ERROR_MESSAGE_JSON=$(printf '%s\n' "${ERROR_MESSAGE}" | jq -Rs .)
# Then embed ERROR_MESSAGE_JSON in the payload


echo "Created check run with ID: $CHECK_RUN_ID"

# Also use the workflow command to create an error annotation
echo "::error file=.github/workflows/container-validation-backends.yml,line=367,title=Deploy Operator Failed::${ERROR_MESSAGE}"

# Fail the step
exit 1

deploy-test-vllm:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing - run even if dependencies fail
if: always() # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, deploy-operator, vllm]
permissions:
contents: read
Expand Down Expand Up @@ -604,13 +735,17 @@
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config get-contexts
- name: Run Tests
id: run-tests
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE

# Redirect all output to a log file while still showing it
exec > >(tee -a test-output.log) 2>&1

cd examples/backends/$FRAMEWORK
export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
export KUBE_NS=$NAMESPACE
Expand Down Expand Up @@ -662,8 +797,9 @@
ATTEMPT=$((ATTEMPT + 1))
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts"
echo "Last response: $MODELS_RESPONSE"
ERROR_MSG="Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts. Last response: $MODELS_RESPONSE"
echo "$ERROR_MSG"
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
exit 1
fi
RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused -X POST "${LLM_URL}/v1/chat/completions" \
Expand All @@ -683,26 +819,102 @@
}' 2>&1)
echo "Response: $RESPONSE"
TEST_RESULT=0
ERROR_MSG=""
if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
echo "Test failed: Response is not valid JSON"
echo "Got: $RESPONSE"
ERROR_MSG="Test failed: Response is not valid JSON. Got: $RESPONSE"
echo "$ERROR_MSG"
TEST_RESULT=1
elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
echo "Test failed: Message role is not 'assistant'"
echo "Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
ERROR_MSG="Test failed: Message role is not 'assistant'. Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
echo "$ERROR_MSG"
TEST_RESULT=1
elif ! echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null 2>&1; then
echo "Test failed: Model name is incorrect"
echo "Got: $(echo "$RESPONSE" | jq '.model')"
ERROR_MSG="Test failed: Model name is incorrect. Expected: ${MODEL_NAME}, Got: $(echo "$RESPONSE" | jq '.model')"
echo "$ERROR_MSG"
TEST_RESULT=1
elif ! echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null 2>&1; then
echo "Test failed: Response content length is not greater than 100 characters"
echo "Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
ERROR_MSG="Test failed: Response content length is not greater than 100 characters. Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
echo "$ERROR_MSG"
TEST_RESULT=1
else
echo "Test passed: Response matches expected format and content"
fi

if [ $TEST_RESULT -ne 0 ]; then
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
fi
exit $TEST_RESULT
continue-on-error: true

- name: Create GitHub Annotation on Failure
if: steps.run-tests.outcome == 'failure'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
set -x

# Extract error message from environment or test output log
ERROR_MESSAGE="${ERROR_MESSAGE:-Unknown error occurred during deployment test}"

# Get additional context from pod logs if available
if [ -n "$GRAPH_NAME" ] && [ -n "$NAMESPACE" ]; then
export KUBECONFIG=$(pwd)/.kubeconfig
POD_STATUS=$(kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $NAMESPACE -o json 2>/dev/null || echo "{}")

# Extract pod error information if any pods are not ready
POD_ERRORS=$(echo "$POD_STATUS" | jq -r '.items[] | select(.status.phase != "Running") | "Pod: \(.metadata.name), Status: \(.status.phase), Reason: \(.status.reason // "N/A")"' 2>/dev/null || echo "")

if [ -n "$POD_ERRORS" ]; then
ERROR_MESSAGE="$ERROR_MESSAGE\n\nPod Status:\n$POD_ERRORS"
fi
fi

# Create annotation using GitHub API
# Note: We use the workflow file path as the annotation target
cat > annotation.json <<EOF
{
"title": "Deployment Test Failed: ${{ env.FRAMEWORK }} (${{ matrix.profile }})",
"summary": "Deployment test failed for ${{ env.FRAMEWORK }} with profile ${{ matrix.profile }}",
"text": "**Job**: ${{ github.job }}\n**Framework**: ${{ env.FRAMEWORK }}\n**Profile**: ${{ matrix.profile }}\n**Namespace**: ${{ needs.deploy-operator.outputs.NAMESPACE }}\n\n**Error Details**:\n\`\`\`\n${ERROR_MESSAGE}\n\`\`\`\n\n[View Job Run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})",
"annotation_level": "failure",
"file": ".github/workflows/container-validation-backends.yml",
"start_line": 426,
"end_line": 618
}
Comment on lines +874 to +883
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Unused annotation.json file; hard-coded line numbers.

Lines 721-730 construct an annotation JSON file, but the subsequent API call (lines 734-756) uses an inline JSON payload instead. The annotation.json file is never used. Remove the dead code. Additionally, line numbers 426, 618 at lines 728-729 and 749-750 are hard-coded and will become stale.

Remove unused annotation.json construction and either:

  1. Use the file if intended: cat annotation.json | jq
  2. Or remove it entirely and rely on the inline payload

Also address the hard-coded line numbers as described in the deploy-operator annotation comment above.

🤖 Prompt for AI Agents
.github/workflows/container-validation-backends.yml lines 721-730: the script
builds an unused annotation.json and embeds hard-coded start_line/end_line
values; remove the dead cat > annotation.json block entirely (or if you intended
to use a file, replace the inline payload usage with piping annotation.json
through jq to the API), and eliminate or compute the hard-coded
"start_line"/"end_line" values (either remove those fields or derive them
dynamically from the job/context rather than using static numbers) so the
workflow no longer produces unused artifacts or stale line-number metadata.

EOF

# Create a check run with the annotation
CHECK_RUN_ID=$(curl -s -X POST \
-H "Authorization: token $GITHUB_TOKEN" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/check-runs" \
-d '{
"name": "Deploy Test: ${{ env.FRAMEWORK }} (${{ matrix.profile }})",
"head_sha": "${{ github.sha }}",
"status": "completed",
"conclusion": "failure",
"output": {
"title": "Deployment Test Failed: ${{ env.FRAMEWORK }} (${{ matrix.profile }})",
"summary": "Deployment test failed for ${{ env.FRAMEWORK }} with profile ${{ matrix.profile }}",
"text": "**Job**: ${{ github.job }}\n**Framework**: ${{ env.FRAMEWORK }}\n**Profile**: ${{ matrix.profile }}\n**Namespace**: ${{ needs.deploy-operator.outputs.NAMESPACE }}\n\n**Error Details**:\n```\n'"${ERROR_MESSAGE}"'\n```\n\n[View Job Run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})",
"annotations": [{
"path": ".github/workflows/container-validation-backends.yml",
"start_line": 426,
"end_line": 618,
"annotation_level": "failure",
"message": "'"${ERROR_MESSAGE}"'",
"title": "Deploy test failed for ${{ env.FRAMEWORK }} (${{ matrix.profile }})"
}]
}
}' | jq -r '.id')

echo "Created check run with ID: $CHECK_RUN_ID"

# Also use the workflow command to create an error annotation
echo "::error file=.github/workflows/container-validation-backends.yml,line=460,title=Deploy Test Failed: ${{ env.FRAMEWORK }} (${{ matrix.profile }})::${ERROR_MESSAGE}"

# Fail the step
exit 1
- name: Cleanup
if: always()
timeout-minutes: 5
Expand All @@ -721,7 +933,8 @@

deploy-test-sglang:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing - run even if dependencies fail
if: always() # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, deploy-operator, sglang]
permissions:
contents: read
Expand All @@ -742,7 +955,8 @@

deploy-test-trtllm:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
# Temporarily override skip condition for testing - run even if dependencies fail
if: always() # TODO: Restore to: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, deploy-operator, trtllm]
permissions:
contents: read
Expand Down
Loading