cobaltcore-dev
diff --git a/‎Tiltfile‎
Lines changed: 0 additions & 3 deletions b/‎Tiltfile‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎helm/README.md‎
Lines changed: 6 additions & 2 deletions b/‎helm/README.md‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎helm/bundles/cortex-cinder/Chart.yaml‎
Lines changed: 0 additions & 4 deletions b/‎helm/bundles/cortex-cinder/Chart.yaml‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml‎
Lines changed: 100 additions & 6 deletions b/‎helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml‎
Lines changed: 100 additions & 6 deletions
diff --git a/‎helm/bundles/cortex-cinder/values.yaml‎
Lines changed: 0 additions & 6 deletions b/‎helm/bundles/cortex-cinder/values.yaml‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎helm/bundles/cortex-manila/Chart.yaml‎
Lines changed: 0 additions & 4 deletions b/‎helm/bundles/cortex-manila/Chart.yaml‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎helm/bundles/cortex-manila/alerts/manila.alerts.yaml‎
Lines changed: 100 additions & 7 deletions b/‎helm/bundles/cortex-manila/alerts/manila.alerts.yaml‎
Lines changed: 100 additions & 7 deletions
diff --git a/‎helm/bundles/cortex-manila/values.yaml‎
Lines changed: 0 additions & 6 deletions b/‎helm/bundles/cortex-manila/values.yaml‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎helm/bundles/cortex-nova/Chart.yaml‎
Lines changed: 0 additions & 4 deletions b/‎helm/bundles/cortex-nova/Chart.yaml‎
Lines changed: 0 additions & 4 deletions
@@ -51,17 +51,14 @@ dep_charts = {
         ('dist/chart', 'cortex'),
     ],
     'cortex-nova': [
-        ('helm/library/cortex-alerts', 'cortex-alerts'),
         ('helm/library/cortex-postgres', 'cortex-postgres'),
         ('dist/chart', 'cortex'),
     ],
     'cortex-manila': [
-        ('helm/library/cortex-alerts', 'cortex-alerts'),
         ('helm/library/cortex-postgres', 'cortex-postgres'),
         ('dist/chart', 'cortex'),
     ],
     'cortex-cinder': [
-        ('helm/library/cortex-alerts', 'cortex-alerts'),
         ('helm/library/cortex-postgres', 'cortex-postgres'),
         ('dist/chart', 'cortex'),
     ],
 
@@ -25,7 +25,6 @@ helm/
 │   ├── cortex-ironcore/         # IronCore scheduling domain
 │   └── cortex-crds/             # CRDs for all operators
 ├── library/                   # Shared library charts
-│   ├── cortex-alerts/           # Common alerting infrastructure
 │   └── cortex-postgres/         # PostgreSQL database
 ├── dev/                       # Development-only charts
 │   └── cortex-prometheus-operator/  # Local monitoring stack
@@ -39,6 +38,7 @@ helm/
 Bundle charts are **umbrella charts** that represent complete deployments for specific scheduling domains. They aggregate operator charts and library charts into deployable units.
 
 **Available bundles:**
+
 - `cortex-nova` - Nova compute scheduling domain
 - `cortex-cinder` - Cinder block storage scheduling domain
 - `cortex-manila` - Manila shared filesystem scheduling domain
@@ -54,10 +54,11 @@ The operator chart contains the core Kubernetes operators built from the Go modu
 Library charts provide **shared, reusable components** that are consumed by bundle charts as dependencies.
 
 **Available library charts:**
-- `cortex-alerts` - Common alerting infrastructure and templates
+
 - `cortex-postgres` - PostgreSQL database deployment with monitoring
 
 **Integration with bundles:**
+
 - Library charts are **included as dependencies** in bundle Chart.yaml files
 - Provide common infrastructure components used across multiple domains
 - Reduce duplication of common services like databases and monitoring
@@ -68,15 +69,18 @@ Library charts provide **shared, reusable components** that are consumed by bund
 Dev charts support **local development and testing** but are not included in production releases.
 
 **Available dev charts:**
+
 - `cortex-prometheus-operator` - Prometheus operator setup for local development
 
 ## Usage Patterns
 
 ### Production Deployment
+
 1. Deploy CRDs first: `helm install cortex-crds bundles/cortex-crds/`
 2. Deploy domain-specific bundle: `helm install cortex-nova bundles/cortex-nova/`
 
 ### Development Setup
+
 1. Deploy monitoring: `helm install prometheus dev/cortex-prometheus-operator/`
 2. Deploy CRDs: `helm install cortex-crds bundles/cortex-crds/`
 3. Deploy and test bundles: `helm install cortex-nova bundles/cortex-nova/`
 
@@ -8,10 +8,6 @@ type: application
 version: 0.0.10
 appVersion: 0.1.0
 dependencies:
-  # from: file://../../library/cortex-alerts
-  - name: cortex-alerts
-    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
-    version: 0.0.1
   # from: file://../../library/cortex-postgres
   - name: cortex-postgres
     repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
 
@@ -1,10 +1,10 @@
 groups:
 - name: cortex-cinder-alerts
   rules:
-  - alert: CortexCinderInitialPlacementDown
+  - alert: CortexCinderSchedulingDown
     expr: |
-      up{component="cortex-cinder-scheduler", namespace="cortex-cinder"} != 1 or
-      absent(up{component="cortex-cinder-scheduler", namespace="cortex-cinder"})
+      up{pod=~"cortex-cinder-scheduling-.*"} != 1 or
+      absent(up{pod=~"cortex-cinder-scheduling-.*"})
     for: 5m
     labels:
       context: liveness
@@ -14,8 +14,102 @@ groups:
       support_group: workload-management
       playbook: docs/support/playbook/cortex/down
     annotations:
-      summary: "Cortex initial placement for Cinder is down"
+      summary: "Cortex Scheduling for Cinder is down"
       description: >
-        The Cortex initial placement is down. Initial placement requests from Cinder will
+        The Cortex scheduling service is down. Scheduling requests from Cinder will
         not be served. This is no immediate problem, since Cinder will continue
-        placing new volumes. However, the placement will be less desirable.
+        placing new volumes. However, the placement will be less desirable.
+  # Liveness alert for the Cinder knowledge pods: fires when any `up` series
+  # for pods matching `cortex-cinder-knowledge-.*` is not 1, or when no such
+  # series exists at all, and the condition holds for 5 minutes.
+  - alert: CortexCinderKnowledgeDown
+    expr: |
+      up{pod=~"cortex-cinder-knowledge-.*"} != 1 or
+      absent(up{pod=~"cortex-cinder-knowledge-.*"})
+    for: 5m
+    labels:
+      context: liveness
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+      playbook: docs/support/playbook/cortex/down
+    annotations:
+      summary: "Cortex Knowledge for Cinder is down"
+      description: >
+        The Cortex Knowledge service is down. This is no immediate problem,
+        since cortex is still able to process requests,
+        but the quality of the responses may be affected.
+  # API alert: fires when the per-second rate of scheduler API requests with
+  # a status matching `4.+`, computed over 5m windows, stays above 0.1 for
+  # 5 minutes.
+  # NOTE(review): the sibling 5xx alert is named
+  # CortexCinderSchedulingHttpRequest500sTooHigh (with "Scheduling"); confirm
+  # whether the naming difference is intentional.
+  - alert: CortexCinderHttpRequest400sTooHigh
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Cinder Scheduler HTTP request 400 errors too high"
+      description: >
+        Cinder Scheduler is responding to placement requests with HTTP 4xx
+        errors. This is expected when the scheduling request cannot be served
+        by Cortex. However, it could also indicate that the request format has
+        changed and Cortex is unable to parse it.
+  # API alert: fires when the per-second rate of scheduler API requests with
+  # a status matching `5.+`, computed over 5m windows, stays above 0.1 for
+  # 5 minutes.
+  - alert: CortexCinderSchedulingHttpRequest500sTooHigh
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Cinder Scheduler HTTP request 500 errors too high"
+      description: >
+        Cinder Scheduler is responding to placement requests with HTTP 5xx errors.
+        This is not expected and indicates that Cortex is having some internal problem.
+        Cinder will continue to place new volumes, but the placement will be less desirable.
+        Thus, no immediate action is needed.
+  # Memory alert: fires when `process_resident_memory_bytes` of a series with
+  # service="cortex-cinder-metrics" stays above 6000 * 1024 * 1024 bytes
+  # (6000 MiB) for 5 minutes.
+  # NOTE(review): the annotations render `$labels.component`, while the
+  # expression selects on `service`; confirm the series carry a `component`
+  # label, otherwise the name in the summary will be empty.
+  - alert: CortexCinderHighMemoryUsage
+    expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024
+    for: 5m
+    labels:
+      context: memory
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "`{{$labels.component}}` uses too much memory"
+      description: >
+        `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it
+        should use much less, so there may be a memory leak or other changes
+        that are causing the memory usage to increase significantly.
+  # CPU alert: fires when the per-second rate of `process_cpu_seconds_total`
+  # (1m windows) for service="cortex-cinder-metrics" stays above 0.5, i.e.
+  # more than half a CPU-second consumed per second, for 5 minutes.
+  # NOTE(review): the annotations render `$labels.component`, while the
+  # expression selects on `service`; confirm the series carry a `component`
+  # label, otherwise the name in the summary will be empty.
+  - alert: CortexCinderHighCPUUsage
+    expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5
+    for: 5m
+    labels:
+      context: cpu
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "`{{$labels.component}}` uses too much CPU"
+      description: >
+        `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually
+        it should use much less, so there may be a CPU leak or other changes
+        that are causing the CPU usage to increase significantly.
+  # Database alert: fires when the per-second rate of
+  # `cortex_db_connection_attempts_total` (5m windows) for
+  # service="cortex-cinder-metrics" stays above 0.1 for 5 minutes.
+  # NOTE(review): the annotations render `$labels.component`, while the
+  # expression selects on `service`; confirm the series carry a `component`
+  # label, otherwise the name in the summary will be empty.
+  - alert: CortexCinderTooManyDBConnectionAttempts
+    expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: db
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "`{{$labels.component}}` is trying to connect to the database too often"
+      description: >
+        `{{$labels.component}}` is trying to connect to the database too often. This may happen
+        when the database is down or the connection parameters are misconfigured.
@@ -111,9 +111,3 @@ cortex-knowledge-controllers:
 # Custom configuration for the cortex postgres chart.
 cortex-postgres:
   fullnameOverride: cortex-cinder-postgresql
-
-# Custom configuration for the cortex core chart.
-cortex-alerts:
-  fullnameOverride: cortex-cinder
-  alerts:
-    componentPrefix: cortex-cinder
@@ -8,10 +8,6 @@ type: application
 version: 0.0.10
 appVersion: 0.1.0
 dependencies:
-  # from: file://../../library/cortex-alerts
-  - name: cortex-alerts
-    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
-    version: 0.0.1
   # from: file://../../library/cortex-postgres
   - name: cortex-postgres
     repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
 
@@ -1,10 +1,10 @@
 groups:
 - name: cortex-manila-alerts
   rules:
-  - alert: CortexManilaInitialPlacementDown
+  - alert: CortexManilaSchedulingDown
     expr: |
-      up{component="cortex-manila-scheduler", namespace="cortex-manila"} != 1 or
-      absent(up{component="cortex-manila-scheduler", namespace="cortex-manila"})
+      up{pod=~"cortex-manila-scheduling-.*"} != 1 or
+      absent(up{pod=~"cortex-manila-scheduling-.*"})
     for: 5m
     labels:
       context: liveness
@@ -14,9 +14,102 @@ groups:
       support_group: workload-management
       playbook: docs/support/playbook/cortex/down
     annotations:
-      summary: "Cortex initial placement for Manila is down"
+      summary: "Cortex Scheduling for Manila is down"
       description: >
-        The Cortex initial placement is down. Initial placement requests from Manila will
+        The Cortex scheduling service is down. Scheduling requests from Manila will
         not be served. This is no immediate problem, since Manila will continue
-        placing new shares. However, the placement will be less desirable.
-
+        placing new shares. However, the placement will be less desirable.
+  # Liveness alert for the Manila knowledge pods: fires when any `up` series
+  # for pods matching `cortex-manila-knowledge-.*` is not 1, or when no such
+  # series exists at all, and the condition holds for 5 minutes.
+  - alert: CortexManilaKnowledgeDown
+    expr: |
+      up{pod=~"cortex-manila-knowledge-.*"} != 1 or
+      absent(up{pod=~"cortex-manila-knowledge-.*"})
+    for: 5m
+    labels:
+      context: liveness
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+      playbook: docs/support/playbook/cortex/down
+    annotations:
+      summary: "Cortex Knowledge for Manila is down"
+      description: >
+        The Cortex Knowledge service is down. This is no immediate problem,
+        since cortex is still able to process requests,
+        but the quality of the responses may be affected.
+  # API alert: fires when the per-second rate of scheduler API requests with
+  # a status matching `4.+`, computed over 5m windows, stays above 0.1 for
+  # 5 minutes.
+  # NOTE(review): the sibling 5xx alert is named
+  # CortexManilaSchedulingHttpRequest500sTooHigh (with "Scheduling"); confirm
+  # whether the naming difference is intentional.
+  - alert: CortexManilaHttpRequest400sTooHigh
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Manila Scheduler HTTP request 400 errors too high"
+      description: >
+        Manila Scheduler is responding to placement requests with HTTP 4xx
+        errors. This is expected when the scheduling request cannot be served
+        by Cortex. However, it could also indicate that the request format has
+        changed and Cortex is unable to parse it.
+  # API alert: fires when the per-second rate of scheduler API requests with
+  # a status matching `5.+`, computed over 5m windows, stays above 0.1 for
+  # 5 minutes.
+  - alert: CortexManilaSchedulingHttpRequest500sTooHigh
+    expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Manila Scheduler HTTP request 500 errors too high"
+      description: >
+        Manila Scheduler is responding to placement requests with HTTP 5xx errors.
+        This is not expected and indicates that Cortex is having some internal problem.
+        Manila will continue to place new shares, but the placement will be less desirable.
+        Thus, no immediate action is needed.
+  # Memory alert: fires when `process_resident_memory_bytes` of a series with
+  # service="cortex-manila-metrics" stays above 6000 * 1024 * 1024 bytes
+  # (6000 MiB) for 5 minutes.
+  # NOTE(review): the annotations render `$labels.component`, while the
+  # expression selects on `service`; confirm the series carry a `component`
+  # label, otherwise the name in the summary will be empty.
+  - alert: CortexManilaHighMemoryUsage
+    expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024
+    for: 5m
+    labels:
+      context: memory
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "`{{$labels.component}}` uses too much memory"
+      description: >
+        `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it
+        should use much less, so there may be a memory leak or other changes
+        that are causing the memory usage to increase significantly.
+  # CPU alert: fires when the per-second rate of `process_cpu_seconds_total`
+  # (1m windows) for service="cortex-manila-metrics" stays above 0.5, i.e.
+  # more than half a CPU-second consumed per second, for 5 minutes.
+  # NOTE(review): the annotations render `$labels.component`, while the
+  # expression selects on `service`; confirm the series carry a `component`
+  # label, otherwise the name in the summary will be empty.
+  - alert: CortexManilaHighCPUUsage
+    expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5
+    for: 5m
+    labels:
+      context: cpu
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "`{{$labels.component}}` uses too much CPU"
+      description: >
+        `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually
+        it should use much less, so there may be a CPU leak or other changes
+        that are causing the CPU usage to increase significantly.
+  # Database alert: fires when the per-second rate of
+  # `cortex_db_connection_attempts_total` (5m windows) for
+  # service="cortex-manila-metrics" stays above 0.1 for 5 minutes.
+  # NOTE(review): the annotations render `$labels.component`, while the
+  # expression selects on `service`; confirm the series carry a `component`
+  # label, otherwise the name in the summary will be empty.
+  - alert: CortexManilaTooManyDBConnectionAttempts
+    expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: db
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "`{{$labels.component}}` is trying to connect to the database too often"
+      description: >
+        `{{$labels.component}}` is trying to connect to the database too often. This may happen
+        when the database is down or the connection parameters are misconfigured.
@@ -111,9 +111,3 @@ cortex-knowledge-controllers:
 # Custom configuration for the cortex postgres chart.
 cortex-postgres:
   fullnameOverride: cortex-manila-postgresql
-
-# Custom configuration for the cortex core chart.
-cortex-alerts:
-  fullnameOverride: cortex-manila
-  alerts:
-    componentPrefix: cortex-manila
@@ -8,10 +8,6 @@ type: application
 version: 0.0.10
 appVersion: 0.1.0
 dependencies:
-  # from: file://../../library/cortex-alerts
-  - name: cortex-alerts
-    repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
-    version: 0.0.1
   # from: file://../../library/cortex-postgres
   - name: cortex-postgres
     repository: oci://ghcr.io/cobaltcore-dev/cortex/charts