From 65c4f5b64ce4d305d6b9a9a0d3d671d74d93e862 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Thu, 6 Nov 2025 10:58:32 -0800 Subject: [PATCH 01/17] refactor: consolidate observability stack into unified docker-observability.yml - Moved metrics (Prometheus, Grafana, DCGM, NATS exporter) and tracing (Tempo) into single docker-observability.yml - Simplified docker-compose.yml to only include core infrastructure (NATS, etcd) - Reorganized observability files: deploy/metrics/* and deploy/tracing/* -> deploy/observability/* - Updated documentation: deploy/tracing/README.md -> docs/observability/tracing.md - Unified Grafana configuration to support both Prometheus and Tempo datasources - Single observability stack now runs on unified 'server' network for better integration Signed-off-by: Keiven Chang --- deploy/docker-compose.yml | 125 +--------------- deploy/docker-observability.yml | 137 ++++++++++++++++++ .../grafana-datasources.yml | 0 .../grafana-dashboard-providers.yml | 0 .../grafana-dcgm-metrics.json | 0 .../grafana-dynamo-dashboard.json | 0 .../grafana-kvbm-dashboard.json | 0 .../{metrics => observability}/k8s/README.md | 0 .../k8s/frontend-podmonitor.yaml | 0 .../grafana-dynamo-dashboard-configmap.yaml | 0 .../k8s/planner-podmonitor.yaml | 0 .../k8s/worker-podmonitor.yaml | 0 .../{metrics => observability}/prometheus.yml | 0 .../tempo-datasource.yml} | 2 +- deploy/{tracing => observability}/tempo.yaml | 0 deploy/{tracing => observability}/trace.png | Bin deploy/tracing/docker-compose.yml | 35 ----- .../observability/tracing.md | 19 ++- 18 files changed, 153 insertions(+), 165 deletions(-) create mode 100644 deploy/docker-observability.yml rename deploy/{metrics => observability}/grafana-datasources.yml (100%) rename deploy/{metrics => observability}/grafana_dashboards/grafana-dashboard-providers.yml (100%) rename deploy/{metrics => observability}/grafana_dashboards/grafana-dcgm-metrics.json (100%) rename deploy/{metrics => observability}/grafana_dashboards/grafana-dynamo-dashboard.json (100%) rename deploy/{metrics => observability}/grafana_dashboards/grafana-kvbm-dashboard.json (100%) rename deploy/{metrics => observability}/k8s/README.md (100%) rename deploy/{metrics => observability}/k8s/frontend-podmonitor.yaml (100%) rename deploy/{metrics => observability}/k8s/grafana-dynamo-dashboard-configmap.yaml (100%) rename deploy/{metrics => observability}/k8s/planner-podmonitor.yaml (100%) rename deploy/{metrics => observability}/k8s/worker-podmonitor.yaml (100%) rename deploy/{metrics => observability}/prometheus.yml (100%) rename deploy/{tracing/grafana/provisioning/datasources/tempo.yaml => observability/tempo-datasource.yml} (96%) rename deploy/{tracing => observability}/tempo.yaml (100%) rename deploy/{tracing => observability}/trace.png (100%) delete mode 100644 deploy/tracing/docker-compose.yml rename deploy/tracing/README.md => docs/observability/tracing.md (89%) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 2b19741f7b..75b2ff1537 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -1,26 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# IMPORT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
+# Bare minimum infrastructure services for Dynamo.
+# For observability (metrics, tracing, dashboards), use docker-observability.yml
+
 networks:
   server:
     driver: bridge
-  monitoring:
-    driver: bridge
 
-# Note that the images are pinned to specific versions to avoid breaking changes.
 services:
   nats-server:
     image: nats:2.11.4
@@ -31,7 +18,6 @@ services:
     - 8222:8222  # the endpoints include /varz, /healthz, ...
     networks:
       - server
-      - monitoring
 
   etcd-server:
     image: bitnamilegacy/etcd:3.6.1
@@ -42,108 +28,3 @@ services:
       - 2380:2380
     networks:
       - server
-      - monitoring
-
-  # All the services below are part of the metrics profile and monitoring network.
-
-  # The exporter translates from /varz and other stats to Prometheus metrics
-  nats-prometheus-exporter:
-    image: natsio/prometheus-nats-exporter:0.17.3
-    command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
-    ports:
-      - 7777:7777
-    networks:
-      - monitoring
-    profiles: [metrics]
-    depends_on:
-      - nats-server
-
-  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
-  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
-  dcgm-exporter:
-    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
-    ports:
-      # Expose dcgm-exporter on port 9401 both inside and outside the container
-      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
-      # To access DCGM metrics:
-      #   Outside the container: curl http://localhost:9401/metrics (or the host IP)
-      #   Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
-      - 9401:9401
-    cap_add:
-      - SYS_ADMIN
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [gpu]
-    environment:
-      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
-      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
-      - DCGM_EXPORTER_LISTEN=:9401
-    runtime: nvidia # Specify the NVIDIA runtime
-    networks:
-      - monitoring
-
-  # To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu:
-  # sudo ufw allow 9090/tcp
-  prometheus:
-    image: prom/prometheus:v3.4.1
-    container_name: prometheus
-    volumes:
-      - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-      - '--storage.tsdb.path=/prometheus'
-      # These provide the web console functionality
-      - '--web.console.libraries=/etc/prometheus/console_libraries'
-      - '--web.console.templates=/etc/prometheus/consoles'
-      - '--web.enable-lifecycle'
-    restart: unless-stopped
-    # Example to pull from the /query endpoint:
-    # {__name__=~"DCGM.*", job="dcgm-exporter"}
-    networks:
-      - monitoring
-    ports:
-      - "9090:9090"
-    profiles: [metrics]
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    depends_on:
-      - dcgm-exporter
-      - nats-prometheus-exporter
-      - etcd-server
-
-  # grafana connects to prometheus via the /query endpoint.
-  # Default credentials are dynamo/dynamo.
-  # To access Grafana from another machine, you may need to disable te firewall on your host. On Ubuntu:
-  # sudo ufw allow 3001/tcp
-  grafana:
-    image: grafana/grafana-enterprise:12.0.1
-    container_name: grafana
-    volumes:
-      - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
-      - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
-    environment:
-      - GF_SERVER_HTTP_PORT=3001
-      # do not make it admin/admin, because you will be prompted to change the password every time
-      - GF_SECURITY_ADMIN_USER=dynamo
-      - GF_SECURITY_ADMIN_PASSWORD=dynamo
-      - GF_USERS_ALLOW_SIGN_UP=false
-      - GF_INSTALL_PLUGINS=grafana-piechart-panel
-      # Default min interval is 5s, but can be configured lower
-      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
-      # Disable password change requirement
-      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
-      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
-      - GF_AUTH_DISABLE_LOGIN_FORM=false
-      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
-    restart: unless-stopped
-    ports:
-      - "3001:3001"
-    networks:
-      - monitoring
-    profiles: [metrics]
-    depends_on:
-      - prometheus
diff --git a/deploy/docker-observability.yml b/deploy/docker-observability.yml
new file mode 100644
index 0000000000..b8e57aa6c3
--- /dev/null
+++ b/deploy/docker-observability.yml
@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Observability stack for Dynamo: metrics, tracing, and visualization.
+# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity.
+#
+# Usage:
+#   docker compose -f deploy/docker-observability.yml up -d
+
+version: '3.8'
+
+networks:
+  server:
+    external: true
+    name: deploy_server
+
+volumes:
+  grafana-data:
+  tempo-data:
+
+services:
+  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
+  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
+  dcgm-exporter:
+    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
+    ports:
+      # Expose dcgm-exporter on port 9401 both inside and outside the container
+      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
+      # To access DCGM metrics:
+      #   Outside the container: curl http://localhost:9401/metrics (or the host IP)
+      #   Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
+      - 9401:9401
+    cap_add:
+      - SYS_ADMIN
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
+      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
+      - DCGM_EXPORTER_LISTEN=:9401
+    runtime: nvidia # Specify the NVIDIA runtime
+    networks:
+      - server
+
+  # The exporter translates from /varz and other stats to Prometheus metrics
+  nats-prometheus-exporter:
+    image: natsio/prometheus-nats-exporter:0.17.3
+    command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
+    ports:
+      - 7777:7777
+    networks:
+      - server
+
+  # To access Prometheus from another machine, you may need to disable the firewall on your host. On Ubuntu:
+  # sudo ufw allow 9090/tcp
+  prometheus:
+    image: prom/prometheus:v3.4.1
+    container_name: prometheus
+    volumes:
+      - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      # These provide the web console functionality
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    # Example to pull from the /query endpoint:
+    # {__name__=~"DCGM.*", job="dcgm-exporter"}
+    ports:
+      - "9090:9090"
+    networks:
+      - server
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    depends_on:
+      - dcgm-exporter
+      - nats-prometheus-exporter
+
+  # Tempo - Distributed tracing backend
+  tempo:
+    image: grafana/tempo:2.8.2
+    command: [ "-config.file=/etc/tempo.yaml" ]
+    user: root
+    volumes:
+      - ./observability/tempo.yaml:/etc/tempo.yaml
+      - tempo-data:/tmp/tempo
+    ports:
+      - "3200:3200"  # Tempo HTTP
+      - "4317:4317"  # OTLP gRPC receiver (accessible from host)
+      - "4318:4318"  # OTLP HTTP receiver (accessible from host)
+    networks:
+      - server
+
+  # Grafana - Visualization and dashboards
+  # Supports both Prometheus (metrics) and Tempo (tracing) datasources
+  # Default credentials: dynamo/dynamo
+  # To access Grafana from another machine, you may need to disable the firewall on your host. On Ubuntu:
+  # sudo ufw allow 3000/tcp
+  grafana:
+    image: grafana/grafana:12.2.0
+    container_name: grafana
+    volumes:
+      - grafana-data:/var/lib/grafana
+      - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
+      - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
+      - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
+    environment:
+      - GF_SERVER_HTTP_PORT=3000
+      # do not make it admin/admin, because you will be prompted to change the password every time
+      - GF_SECURITY_ADMIN_USER=dynamo
+      - GF_SECURITY_ADMIN_PASSWORD=dynamo
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
+      - GF_INSTALL_PLUGINS=grafana-piechart-panel
+      # Default min interval is 5s, but can be configured lower
+      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
+      # Disable password change requirement
+      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
+      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
+      - GF_AUTH_DISABLE_LOGIN_FORM=false
+      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
+    restart: unless-stopped
+    ports:
+      - "3000:3000"
+    networks:
+      - server
+    depends_on:
+      - prometheus
+      - tempo
+
diff --git a/deploy/metrics/grafana-datasources.yml b/deploy/observability/grafana-datasources.yml
similarity index 100%
rename from deploy/metrics/grafana-datasources.yml
rename to deploy/observability/grafana-datasources.yml
diff --git a/deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml b/deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml
similarity index 100%
rename from deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml
rename to deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml
diff --git a/deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json b/deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json
similarity index 100%
rename from deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json
rename to deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json
diff --git a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json 
b/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json rename to deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json diff --git a/deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json b/deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json rename to deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json diff --git a/deploy/metrics/k8s/README.md b/deploy/observability/k8s/README.md similarity index 100% rename from deploy/metrics/k8s/README.md rename to deploy/observability/k8s/README.md diff --git a/deploy/metrics/k8s/frontend-podmonitor.yaml b/deploy/observability/k8s/frontend-podmonitor.yaml similarity index 100% rename from deploy/metrics/k8s/frontend-podmonitor.yaml rename to deploy/observability/k8s/frontend-podmonitor.yaml diff --git a/deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml similarity index 100% rename from deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml rename to deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml diff --git a/deploy/metrics/k8s/planner-podmonitor.yaml b/deploy/observability/k8s/planner-podmonitor.yaml similarity index 100% rename from deploy/metrics/k8s/planner-podmonitor.yaml rename to deploy/observability/k8s/planner-podmonitor.yaml diff --git a/deploy/metrics/k8s/worker-podmonitor.yaml b/deploy/observability/k8s/worker-podmonitor.yaml similarity index 100% rename from deploy/metrics/k8s/worker-podmonitor.yaml rename to deploy/observability/k8s/worker-podmonitor.yaml diff --git a/deploy/metrics/prometheus.yml b/deploy/observability/prometheus.yml similarity index 100% rename from deploy/metrics/prometheus.yml rename to deploy/observability/prometheus.yml diff --git a/deploy/tracing/grafana/provisioning/datasources/tempo.yaml b/deploy/observability/tempo-datasource.yml similarity index 96% rename from deploy/tracing/grafana/provisioning/datasources/tempo.yaml rename to deploy/observability/tempo-datasource.yml index 388c461371..14efa7c770 100644 --- a/deploy/tracing/grafana/provisioning/datasources/tempo.yaml +++ b/deploy/observability/tempo-datasource.yml @@ -9,7 +9,7 @@ datasources: access: proxy url: http://tempo:3200 uid: tempo - isDefault: true + isDefault: false editable: true jsonData: httpMethod: GET diff --git a/deploy/tracing/tempo.yaml b/deploy/observability/tempo.yaml similarity index 100% rename from deploy/tracing/tempo.yaml rename to deploy/observability/tempo.yaml diff --git a/deploy/tracing/trace.png b/deploy/observability/trace.png similarity index 100% rename from deploy/tracing/trace.png rename to deploy/observability/trace.png diff --git a/deploy/tracing/docker-compose.yml b/deploy/tracing/docker-compose.yml deleted file mode 100644 index 16a5f0657d..0000000000 --- a/deploy/tracing/docker-compose.yml +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -version: '3.8' - -services: - # Tempo - Distributed tracing backend - tempo: - image: grafana/tempo:2.8.2 - command: [ "-config.file=/etc/tempo.yaml" ] - volumes: - - ./tempo.yaml:/etc/tempo.yaml - - tempo-data:/tmp/tempo - ports: - - "3200:3200" # Tempo HTTP - - "4317:4317" # OTLP gRPC receiver (accessible from host) - - "4318:4318" # OTLP HTTP receiver (accessible from host) - - # Grafana - Visualization and dashboards - grafana: - image: grafana/grafana:12.2.0 - ports: - - "3000:3000" - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor - volumes: - - grafana-data:/var/lib/grafana - - ./grafana/provisioning:/etc/grafana/provisioning - depends_on: - - tempo - -volumes: - tempo-data: - grafana-data: diff --git a/deploy/tracing/README.md b/docs/observability/tracing.md similarity index 89% rename from deploy/tracing/README.md rename to docs/observability/tracing.md index a2efa75bd5..fcbe194a3d 100644 --- a/deploy/tracing/README.md +++ b/docs/observability/tracing.md @@ -7,6 +7,8 @@ SPDX-License-Identifier: Apache-2.0 This guide explains how to set up and view distributed traces in Grafana Tempo for Dynamo workloads. +> **💡 Note:** For local development, use the unified observability stack at `../../deploy/docker-observability.yml`, which includes Tempo, Prometheus, Grafana, and metric exporters in one convenient stack. + ## Overview Dynamo supports OpenTelemetry-based distributed tracing, allowing you to visualize request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. @@ -19,7 +21,7 @@ Dynamo supports OpenTelemetry-based distributed tracing, allowing you to visuali ## Environment Variables -Dynamo's tracing is configured via environment variables. For complete logging documentation, see [docs/observability/logging.md](../../docs/observability/logging.md). +Dynamo's tracing is configured via environment variables. For complete logging documentation, see [logging.md](./logging.md). ### Required Environment Variables @@ -52,23 +54,26 @@ export OTEL_SERVICE_NAME=dynamo-frontend ## Local Deployment with Docker Compose -### 1. Start Tempo and Grafana +### 1. Start the Unified Observability Stack -From the `deploy/tracing` directory, start the observability stack: +From the `deploy` directory, start the unified observability stack: ```bash -cd deploy/tracing -docker-compose up -d +cd deploy +docker compose -f docker-observability.yml up -d ``` This will start: - **Tempo** on `http://localhost:3200` (HTTP API) and `localhost:4317` (OTLP gRPC) -- **Grafana** on `http://localhost:3000` (username: `admin`, password: `admin`) +- **Prometheus** on `http://localhost:9090` +- **Grafana** on `http://localhost:3000` (username: `dynamo`, password: `dynamo`) +- **DCGM Exporter** on `http://localhost:9401/metrics` (GPU metrics) +- **NATS Exporter** on `http://localhost:7777/metrics` Verify services are running: ```bash -docker-compose ps +docker compose -f docker-observability.yml ps ``` ### 2. 
Set Environment Variables From 3b63e21aafa0564e71e4a19dd67e0e5d766891be Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Thu, 6 Nov 2025 22:11:16 +0000 Subject: [PATCH 02/17] Reorganize observability configs and improve tracing docs - Move deploy/logging to deploy/observability/k8s/logging for better organization - Move trace.png to docs/observability/ to be alongside tracing.md - Fix vllm lazy import of kvbm to avoid Tokio runtime initialization issues - Add log level documentation explaining DEBUG vs INFO for trace visibility - Update all references to reflect new paths - Clarify OTEL environment variable defaults and behavior Signed-off-by: Keiven Chang --- .../{ => observability/k8s}/logging/README.md | 2 +- .../k8s}/logging/grafana/dashboard.json | 0 .../logging/grafana/logging-dashboard.yaml | 0 .../k8s}/logging/grafana/loki-datasource.yaml | 0 .../k8s}/logging/values/alloy-values.yaml | 0 .../k8s}/logging/values/loki-values.yaml | 0 docs/kubernetes/observability/logging.md | 10 +- docs/kubernetes/observability/metrics.md | 4 +- {deploy => docs}/observability/trace.png | Bin docs/observability/tracing.md | 88 +++++++++++++----- 10 files changed, 74 insertions(+), 30 deletions(-) rename deploy/{ => observability/k8s}/logging/README.md (75%) rename deploy/{ => observability/k8s}/logging/grafana/dashboard.json (100%) rename deploy/{ => observability/k8s}/logging/grafana/logging-dashboard.yaml (100%) rename deploy/{ => observability/k8s}/logging/grafana/loki-datasource.yaml (100%) rename deploy/{ => observability/k8s}/logging/values/alloy-values.yaml (100%) rename deploy/{ => observability/k8s}/logging/values/loki-values.yaml (100%) rename {deploy => docs}/observability/trace.png (100%) diff --git a/deploy/logging/README.md b/deploy/observability/k8s/logging/README.md similarity index 75% rename from deploy/logging/README.md rename to deploy/observability/k8s/logging/README.md index 2423989d99..85634e5273 100644 --- a/deploy/logging/README.md +++ b/deploy/observability/k8s/logging/README.md @@ -1,3 +1,3 @@ # Dynamo Logging on Kubernetes -For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../docs/kubernetes/observability/logging.md). +For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../../../docs/kubernetes/observability/logging.md). 
diff --git a/deploy/logging/grafana/dashboard.json b/deploy/observability/k8s/logging/grafana/dashboard.json similarity index 100% rename from deploy/logging/grafana/dashboard.json rename to deploy/observability/k8s/logging/grafana/dashboard.json diff --git a/deploy/logging/grafana/logging-dashboard.yaml b/deploy/observability/k8s/logging/grafana/logging-dashboard.yaml similarity index 100% rename from deploy/logging/grafana/logging-dashboard.yaml rename to deploy/observability/k8s/logging/grafana/logging-dashboard.yaml diff --git a/deploy/logging/grafana/loki-datasource.yaml b/deploy/observability/k8s/logging/grafana/loki-datasource.yaml similarity index 100% rename from deploy/logging/grafana/loki-datasource.yaml rename to deploy/observability/k8s/logging/grafana/loki-datasource.yaml diff --git a/deploy/logging/values/alloy-values.yaml b/deploy/observability/k8s/logging/values/alloy-values.yaml similarity index 100% rename from deploy/logging/values/alloy-values.yaml rename to deploy/observability/k8s/logging/values/alloy-values.yaml diff --git a/deploy/logging/values/loki-values.yaml b/deploy/observability/k8s/logging/values/loki-values.yaml similarity index 100% rename from deploy/logging/values/loki-values.yaml rename to deploy/observability/k8s/logging/values/loki-values.yaml diff --git a/docs/kubernetes/observability/logging.md b/docs/kubernetes/observability/logging.md index 0784cf05c7..68cba5604d 100644 --- a/docs/kubernetes/observability/logging.md +++ b/docs/kubernetes/observability/logging.md @@ -46,7 +46,7 @@ helm repo add grafana https://grafana.github.io/helm-charts helm repo update # Install Loki -helm install --values deploy/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE +helm install --values deploy/observability/k8s/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE ``` Our configuration (`loki-values.yaml`) sets up Loki in a simple configuration that is suitable for testing and development. It uses a local MinIO for storage. 
The installation pods can be viewed with: @@ -60,7 +60,7 @@ Next, install the Grafana Alloy collector to gather logs from your Kubernetes cl ```bash # Generate a custom values file with the namespace information -envsubst < deploy/logging/values/alloy-values.yaml > alloy-custom-values.yaml +envsubst < deploy/observability/k8s/logging/values/alloy-values.yaml > alloy-custom-values.yaml # Install the collector helm install --values alloy-custom-values.yaml alloy grafana/k8s-monitoring -n $MONITORING_NAMESPACE @@ -110,10 +110,10 @@ Since we are using Grafana with the Prometheus Operator, we can simply apply the ```bash # Configure Grafana with the Loki datasource -envsubst < deploy/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +envsubst < deploy/observability/k8s/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - # Configure Grafana with the Dynamo Logs dashboard -envsubst < deploy/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +envsubst < deploy/observability/k8s/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - ``` > [!Note] @@ -141,4 +141,4 @@ kubectl port-forward svc/prometheus-grafana 3000:80 -n $MONITORING_NAMESPACE If everything is working, under Home > Dashboards > Dynamo Logs, you should see a dashboard that can be used to view the logs associated with our DynamoGraphDeployments -The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g frontend, worker, etc). \ No newline at end of file +The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g frontend, worker, etc). diff --git a/docs/kubernetes/observability/metrics.md b/docs/kubernetes/observability/metrics.md index e03ec3efeb..f8d6f8696b 100644 --- a/docs/kubernetes/observability/metrics.md +++ b/docs/kubernetes/observability/metrics.md @@ -128,9 +128,7 @@ spec: Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard: ```bash -pushd deploy/metrics/k8s -kubectl apply -n monitoring -f grafana-dynamo-dashboard-configmap.yaml -popd +kubectl apply -n monitoring -f deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml ``` The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_dashboard: "1"`, the Grafana will discover and populate it to its list of available dashboards. The dashboard includes panels for: diff --git a/deploy/observability/trace.png b/docs/observability/trace.png similarity index 100% rename from deploy/observability/trace.png rename to docs/observability/trace.png diff --git a/docs/observability/tracing.md b/docs/observability/tracing.md index fcbe194a3d..4ca5cc10fd 100644 --- a/docs/observability/tracing.md +++ b/docs/observability/tracing.md @@ -7,7 +7,9 @@ SPDX-License-Identifier: Apache-2.0 This guide explains how to set up and view distributed traces in Grafana Tempo for Dynamo workloads. -> **💡 Note:** For local development, use the unified observability stack at `../../deploy/docker-observability.yml`, which includes Tempo, Prometheus, Grafana, and metric exporters in one convenient stack. +> **Note:** This guide covers local single-instance deployments using Docker Compose for demonstration purposes. For distributed Kubernetes deployments, see the [Kubernetes Deployment](#kubernetes-deployment) section. 
+ +> **💡 Tip:** For local development, use the unified observability stack at `../../deploy/docker-observability.yml`, which includes Tempo, Prometheus, Grafana, and metric exporters in one convenient stack. ## Overview @@ -25,15 +27,39 @@ Dynamo's tracing is configured via environment variables. For complete logging d ### Required Environment Variables -| Variable | Description | Example Value | -|----------|-------------|---------------| -| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `true` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` (local) or `http://tempo:4317` (docker) | -| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo-frontend`, `dynamo-worker-prefill`, `dynamo-worker-decode` | +| Variable | Description | Default | Example Value | +|----------|-------------|---------|---------------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `false` | `true` | +| `DYN_LOG` | Log level (info or debug). Use debug to see detailed traces. | `info` | `debug` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | disabled | `1` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` (docker) | +| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend`, `dynamo-worker-prefill`, `dynamo-worker-decode` | + +### Tracing Behavior + +- **When `OTEL_EXPORT_ENABLED` is NOT set**: Tracing is disabled. Traces are generated locally for trace logging, but not exported to any backend. +- **When `OTEL_EXPORT_ENABLED=1`**: Traces are exported via OTLP to the endpoint specified by `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`. + - If `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` is NOT set, it defaults to `http://localhost:4317` + - If `OTEL_SERVICE_NAME` is NOT set, it defaults to `dynamo` **Note:** When `OTEL_EXPORT_ENABLED=1`, logging initialization is deferred until the runtime is available (required by the OTEL exporter). This means some early logs will be dropped. This will be fixed in a future release. +### Log Levels and Visibility + +Traces are exported to Tempo/Grafana regardless of log level. Set `DYN_LOG` only if you want to view trace details in console logs for debugging. + +JSONL logs are written to **stderr**: + +- **INFO (default)**: Initialization, errors, high-level operations +- **DEBUG**: Detailed traces with trace IDs, span IDs, and routing decisions + +Example: +```bash +export DYN_LOGGING_JSONL=true +export DYN_LOG=debug +python -m dynamo.frontend --http-port 8000 +``` + ### Example Configuration ```bash @@ -43,8 +69,8 @@ export DYN_LOGGING_JSONL=true # Enable trace export to Tempo export OTEL_EXPORT_ENABLED=1 -# Set the Tempo endpoint (docker-compose network) -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317 +# Set the Tempo endpoint (docker-compose network). Note that if this is not specified, it will default to http://localhost:4317 +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo-host-name-here:4317 # Set service name to identify this component export OTEL_SERVICE_NAME=dynamo-frontend @@ -54,6 +80,8 @@ export OTEL_SERVICE_NAME=dynamo-frontend ## Local Deployment with Docker Compose +> **Note:** The following Docker Compose commands are for demonstrating single-instance local deployments, not distributed Kubernetes environments. + ### 1. 
Start the Unified Observability Stack From the `deploy` directory, start the unified observability stack: @@ -85,12 +113,27 @@ Configure Dynamo components to export traces: export DYN_LOGGING_JSONL=true export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 +``` + +### 3a. Quick Start: Single GPU Aggregated Deployment + +For a simple single-GPU deployment, start the frontend and a single vLLM worker: -# Set service names for each component +```bash +# Start the frontend with tracing enabled export OTEL_SERVICE_NAME=dynamo-frontend +python -m dynamo.frontend --router-mode kv --http-port=8000 & + +# Start a single vLLM worker (aggregated prefill and decode) +export OTEL_SERVICE_NAME=dynamo-worker-vllm +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & + +wait ``` -### 3. Run vLLM Disaggregated Deployment +This runs both prefill and decode on the same GPU, providing a simpler setup for testing tracing. + +### 3b. Run vLLM Disaggregated Deployment (2 GPUs) Run the vLLM disaggregated script with tracing enabled: @@ -132,9 +175,11 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ wait ``` +For disaggregated deployments, this separates prefill and decode onto different GPUs for better resource utilization. + ### 4. Generate Traces -Send requests to the frontend to generate traces: +Send requests to the frontend to generate traces (works for both aggregated and disaggregated deployments): ```bash curl -d '{ @@ -149,31 +194,32 @@ curl -d '{ http://localhost:8000/v1/chat/completions ``` -### 5. View Traces in Grafana Tempo +### 6. View Traces in Grafana Tempo 1. Open Grafana at `http://localhost:3000` -2. Login with username `admin` and password `admin` +2. Login with username `dynamo` and password `dynamo` 3. Navigate to **Explore** (compass icon in the left sidebar) 4. Select **Tempo** as the data source (should be selected by default) -5. Use the **Search** tab to find traces: +5. In the query type, select **"Search"** (not TraceQL, not Service Graph) +6. Use the **Search** tab to find traces: - Search by **Service Name** (e.g., `dynamo-frontend`) - Search by **Span Name** (e.g., `http-request`, `handle_payload`) - Search by **Tags** (e.g., `x_request_id=test-trace-001`) -6. Click on a trace to view the detailed flame graph +7. Click on a trace to view the detailed flame graph #### Example Trace View Below is an example of what a trace looks like in Grafana Tempo: -![Trace Example](./trace.png) +![Trace Example](trace.png) -### 6. Stop Services +### 7. 
Stop Services -When done, stop the Tempo and Grafana stack: +When done, stop the observability stack: ```bash -cd deploy/tracing -docker-compose down +cd deploy +docker compose -f docker-observability.yml down ``` --- From 0b71cf8463a800f6dbe9789813b5445354ca25d5 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 01:43:20 +0000 Subject: [PATCH 03/17] Standardize observability docs structure and improve clarity - Create docs/observability/README.md as central hub - Split metrics-developer-guide.md from prometheus-grafana.md - Standardize all docs: Overview, Environment Variables, Getting Started - Update env variable parsing to accept truthy values (true/1/on/yes) - Consolidate prometheus-grafana.md as quick start guide - Improve metrics.md as reference document - Clarify tracing requirements and overlap with logging - Fix double space and grammatical issues Signed-off-by: Keiven Chang --- README.md | 13 +- .../grafana-dynamo-dashboard.json | 2 +- .../grafana-dynamo-dashboard-configmap.yaml | 2 +- docs/observability/README.md | 32 ++ docs/observability/health-checks.md | 46 +- docs/observability/logging.md | 115 ++-- docs/observability/metrics-developer-guide.md | 270 +++++++++ docs/observability/metrics.md | 210 +++++-- docs/observability/prometheus-grafana.md | 517 ++++-------------- docs/observability/tracing.md | 110 +--- .../python/examples/metrics/README.md | 418 +------------- lib/bindings/python/rust/lib.rs | 12 +- lib/runtime/src/logging.rs | 6 +- 13 files changed, 704 insertions(+), 1049 deletions(-) create mode 100644 docs/observability/README.md create mode 100644 docs/observability/metrics-developer-guide.md diff --git a/README.md b/README.md index 5a0576355c..5eb6ba557e 100644 --- a/README.md +++ b/README.md @@ -101,12 +101,21 @@ To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynam To quickly setup etcd & NATS, you can also run: -``` +```bash # At the root of the repository: -# Edit deploy/docker-compose.yml to comment out "runtime: nvidia" of the dcgm-exporter service if the nvidia container runtime isn't deployed or to be used. docker compose -f deploy/docker-compose.yml up -d ``` +### Optional: Observability Stack + +For monitoring with metrics (Prometheus, Grafana) and distributed tracing (Tempo), deploy the observability stack: + +```bash +docker compose -f deploy/docker-observability.yml up -d +``` + +This provides GPU metrics (DCGM), NATS metrics, Prometheus, and Grafana dashboards. Access Grafana at `http://localhost:3000` (username: `dynamo`, password: `dynamo`). + ## 2. Select an engine We publish Python wheels specialized for each of our supported engines: vllm, sglang, and trtllm. The examples that follow use SGLang; continue reading for other engines. 
diff --git a/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json b/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json index 76b822c6f9..1ef1abc7c1 100644 --- a/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json +++ b/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json @@ -1020,7 +1020,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Dynamo Dashboard", + "title": "Dynamo Dashboard (generic)", "uid": "97ae8df9-138a-4f7a-9b0f-635b77d818fe", "version": 1 } \ No newline at end of file diff --git a/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml index 0c4ed0c011..ee1088556b 100644 --- a/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml +++ b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml @@ -1002,7 +1002,7 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Dynamo Dashboard", + "title": "Dynamo Dashboard (generic)", "uid": "dynamo-dashboard", "version": 1 } diff --git a/docs/observability/README.md b/docs/observability/README.md new file mode 100644 index 0000000000..12c71c335e --- /dev/null +++ b/docs/observability/README.md @@ -0,0 +1,32 @@ + + +# Dynamo Observability + +## Quick Start + +For a quick start guide to get Prometheus and Grafana running with Dynamo on a single machine, see [Prometheus + Grafana Setup](prometheus-grafana.md). + +## Observability Documentations + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`† | +| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | +| [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† | +| [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† | + +**Variables marked with † are shared across multiple observability systems.** + +## Developer Guides + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics Developer Guide](metrics-developer-guide.md) | Creating custom metrics in Rust and Python | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`† | + +## Kubernetes + +For Kubernetes-specific setup and configuration, see [docs/kubernetes/observability/](../kubernetes/observability/). + diff --git a/docs/observability/health-checks.md b/docs/observability/health-checks.md index 9e77f3202b..2213b2bc10 100644 --- a/docs/observability/health-checks.md +++ b/docs/observability/health-checks.md @@ -11,6 +11,41 @@ Dynamo provides health check and liveness HTTP endpoints for each component whic can be used to configure startup, liveness and readiness probes in orchestration frameworks such as Kubernetes. 
+## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_ENABLED` | Enable system status server | `false` | `true` | +| `DYN_SYSTEM_PORT` | System status server port | `8081` | `9090` | +| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Initial health status | `notready` | `ready`, `notready` | +| `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` | +| `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` | +| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` | + +## Getting Started (Single GPU) + +Enable health checks and query endpoints: + +```bash +# Enable system status server +export DYN_SYSTEM_ENABLED=true +export DYN_SYSTEM_PORT=8081 + +# Start your Dynamo components +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Check health status: + +```bash +# Frontend health (port 8000) +curl -s localhost:8000/health | jq + +# Worker health (port 8081) +curl -s localhost:8081/health | jq +``` + ## Frontend Liveness Check The frontend liveness endpoint reports a status of `live` as long as @@ -124,17 +159,6 @@ when initializing and HTTP status code `HTTP/1.1 200 OK` once ready. > **Note**: Both /live and /ready return the same information -### Environment Variables for Enabling Health Checks - -| **Environment Variable** | **Description** | **Example Settings** | -| -------------------------| ------------------- | ------------------------------------------------ | -| `DYN_SYSTEM_ENABLED` | Enables the system status server. | `true`, `false` | -| `DYN_SYSTEM_PORT` | Specifies the port for the system status server. | `9090` | -| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Sets the initial health status of the system (ready/not ready). | `ready`, `notready` | -| `DYN_SYSTEM_HEALTH_PATH` | Custom path for the health endpoint. | `/custom/health` | -| `DYN_SYSTEM_LIVE_PATH` | Custom path for the liveness endpoint. | `/custom/live` | -| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Specifies endpoints to check for determining overall system health status. | `["generate"]` | - ### Example Environment Setting ``` diff --git a/docs/observability/logging.md b/docs/observability/logging.md index fac8ff910d..8b811d0649 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -24,18 +24,32 @@ JSONL is enabled logs additionally contain `span` creation and exit events as well as support for `trace_id` and `span_id` fields for distributed tracing. 
-## Environment Variables for configuring Logging +## Environment Variables -| Environment Variable | Description | Example Settings | -| ----------------------------------- | --------------------------------------------| ---------------------------------------------------- | -| `DYN_LOGGING_JSONL` | Enable JSONL logging format (default: READABLE) | `DYN_LOGGING_JSONL=true` | -| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for logging timestamps (default: UTC) | `DYN_LOG_USE_LOCAL_TZ=1` | -| `DYN_LOG` | Log levels per target `,=,=` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | -| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration file | `DYN_LOGGING_CONFIG_PATH=/path/to/config.toml`| -| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces (default: `dynamo`) | `OTEL_SERVICE_NAME=dynamo-frontend` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting (set to `1` to enable) | `OTEL_EXPORT_ENABLED=1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint (default: http://localhost:4317) | `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317` | +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format | `false` | `true` | +| `DYN_LOG` | Log level: `info` or `debug` | `info` | `debug` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps | `false` | `true` | +| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration | none | `/path/to/config.toml` | +| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces | `dynamo` | `dynamo-frontend` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` | +## Getting Started + +Enable structured JSONL logging: + +```bash +export DYN_LOGGING_JSONL=true +export DYN_LOG=debug + +# Start your Dynamo components +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Logs will be written to stderr in JSONL format with trace context. ## Available Logging Levels @@ -85,68 +99,55 @@ Resulting Log format: {"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"} ``` -## OpenTelemetry Distributed Tracing - -When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. By default, traces are **not exported**. To export traces to an observability backend (like Tempo, Jaeger, or Zipkin), set `OTEL_EXPORT_ENABLED=1`. +## Logging of OpenTelemetry Tracing -### Behavior +When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend. -- **With `DYN_LOGGING_JSONL=true` only**: OpenTelemetry layer is active, generating trace context and span IDs for all requests. Traces appear in logs but are not exported anywhere. 
-- **With `OTEL_EXPORT_ENABLED=1` and `DYN_LOGGING_JSONL=true`**: Same as above, plus traces are exported to an OTLP collector for visualization. +**Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md) since OpenTelemetry has aspects of both logging and tracing. For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). -### Configuration +### Configuration for Logging -To enable OTLP trace exporting: - -1. Set `OTEL_EXPORT_ENABLED=1` to enable trace export -2. Optionally configure the endpoint using `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` (default: `http://localhost:4317`) -3. Optionally set `OTEL_SERVICE_NAME` to identify the service (useful in Kubernetes, default: `dynamo`) - -**Export Settings:** -- **Protocol**: gRPC (Tonic) -- **Service Name**: Value of `OTEL_SERVICE_NAME` env var, or `dynamo` if not set -- **Endpoint**: Value of `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` env var, or `http://localhost:4317` if not set - -### Example: JSONL Logging Only (No Export) +To see OpenTelemetry trace information in logs: ```bash export DYN_LOGGING_JSONL=true -# OpenTelemetry is active, traces appear in logs, but nothing is exported +export DYN_LOG=debug # Set to debug to see detailed trace logs + +# Start your Dynamo components (e.g., frontend and worker) +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & ``` -### Example: JSONL Logging + Trace Export to Tempo +This enables JSONL logging with `trace_id` and `span_id` fields. Traces appear in logs but are not exported to any backend. + +### Example Request + +Send a request to generate logs with trace context: ```bash -export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317 -export OTEL_SERVICE_NAME=dynamo-frontend -# OpenTelemetry is active, traces appear in logs AND are exported to Tempo +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] +}' \ +http://localhost:8000/v1/chat/completions ``` -## Trace and Span Information +Check the logs (stderr) for JSONL output containing `trace_id`, `span_id`, and `x_request_id` fields. -### Example Request +## Trace and Span Information in Logs -```sh -curl -X POST http://localhost:8000/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "Qwen/Qwen3-0.6B", - "messages": [ - { - "role": "user", - "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" - } - ], - "stream": true, - "max_tokens": 1000, - }' -``` +This section shows how OpenTelemetry trace and span information appears in JSONL logs. These logs can be used to understand request flows even without a trace visualization backend. 
+ +### Example Disaggregated Trace in Grafana When viewing the corresponding trace in Grafana, you should be able to see something like the following: -![Trace Example](./grafana-disagg-trace.png) +![Disaggregated Trace Example](grafana-disagg-trace.png) ### Trace Overview @@ -208,7 +209,7 @@ When viewing the corresponding trace in Grafana, you should be able to see somet | **Busy Time** | 3,795,258 ns (3.79ms) | | **Idle Time** | 3,996,532,471 ns (3.99s) | -### Frontend Logs +### Frontend Logs with Trace Context The following shows the JSONL logs from the frontend service for the same request. Note the `trace_id` field (`b672ccf48683b392891c5cb4163d4b51`) that correlates all logs for this request, and the `span_id` field that identifies individual operations: @@ -220,7 +221,7 @@ The following shows the JSONL logs from the frontend service for the same reques {"time":"2025-10-31T20:52:10.745545Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} ``` -## Custom Request IDs +## Custom Request IDs in Logs You can provide a custom request ID using the `x-request-id` header. This ID will be attached to all spans and logs for that request, making it easier to correlate traces with application-level request tracking. @@ -238,7 +239,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \ "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" } ], - "stream": true, + "stream": false, "max_tokens": 1000 }' ``` diff --git a/docs/observability/metrics-developer-guide.md b/docs/observability/metrics-developer-guide.md new file mode 100644 index 0000000000..c07d235751 --- /dev/null +++ b/docs/observability/metrics-developer-guide.md @@ -0,0 +1,270 @@ + + +# Metrics Developer Guide + +This guide explains how to create and use custom metrics in Dynamo components using the Dynamo metrics API. + +## Metrics Exposure + +All metrics created via the Dynamo metrics API are automatically exposed on the `/metrics` HTTP endpoint in Prometheus Exposition Format text when the following environment variables are set: + +- `DYN_SYSTEM_ENABLED=true` - Enable the system metrics server +- `DYN_SYSTEM_PORT=` - Port for the metrics endpoint (default: `8081`) + +Example: +```bash +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model +``` + +Prometheus Exposition Format text metrics will be available at: `http://localhost:8081/metrics` + +## Metric Name Constants + +The [prometheus_names.rs](../../lib/runtime/src/metrics/prometheus_names.rs) module provides centralized metric name constants and sanitization functions to ensure consistency across all Dynamo components. + +--- + +## Metrics API in Rust + +The metrics API is accessible through the `.metrics()` method on runtime, namespace, component, and endpoint objects. See [Runtime Hierarchy](metrics.md#runtime-hierarchy) for details on the hierarchical structure. 
+ +### Available Methods + +- `.metrics().create_counter()`: Create a counter metric +- `.metrics().create_gauge()`: Create a gauge metric +- `.metrics().create_histogram()`: Create a histogram metric +- `.metrics().create_countervec()`: Create a counter with labels +- `.metrics().create_gaugevec()`: Create a gauge with labels +- `.metrics().create_histogramvec()`: Create a histogram with labels + +### Creating Metrics + +```rust +use dynamo_runtime::DistributedRuntime; + +let runtime = DistributedRuntime::new()?; +let endpoint = runtime.namespace("my_namespace").component("my_component").endpoint("my_endpoint"); + +// Simple metrics +let requests_total = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[] +)?; + +let active_connections = endpoint.metrics().create_gauge( + "active_connections", + "Active connections", + &[] +)?; + +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +### Using Metrics + +```rust +// Counters +requests_total.inc(); + +// Gauges +active_connections.set(42.0); +active_connections.inc(); +active_connections.dec(); + +// Histograms +latency.observe(0.023); // 23ms +``` + +### Vector Metrics with Labels + +```rust +// Create vector metrics with label names +let requests_by_model = endpoint.metrics().create_countervec( + "requests_by_model", + "Requests by model", + &["model_type", "model_size"], + &[] +)?; + +let memory_by_gpu = endpoint.metrics().create_gaugevec( + "gpu_memory_bytes", + "GPU memory by device", + &["gpu_id", "memory_type"], + &[] +)?; + +// Use with specific label values +requests_by_model.with_label_values(&["llama", "7b"]).inc(); +memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); +``` + +### Advanced Features + +**Custom histogram buckets:** +```rust +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +**Constant labels:** +```rust +let counter = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[("region", "us-west"), ("env", "prod")] +)?; +``` + +--- + +## Metrics API in Python + +Python components can create and manage Prometheus metrics using the same metrics API through Python bindings. + +### Available Methods + +- `endpoint.metrics.create_counter()` / `create_intcounter()`: Create a counter metric +- `endpoint.metrics.create_gauge()` / `create_intgauge()`: Create a gauge metric +- `endpoint.metrics.create_histogram()`: Create a histogram metric +- `endpoint.metrics.create_countervec()` / `create_intcountervec()`: Create a counter with labels +- `endpoint.metrics.create_gaugevec()` / `create_intgaugevec()`: Create a gauge with labels +- `endpoint.metrics.create_histogramvec()`: Create a histogram with labels + +All metrics are imported from `dynamo.prometheus_metrics`. 
+ +### Creating Metrics + +```python +from dynamo.runtime import DistributedRuntime + +drt = DistributedRuntime() +endpoint = drt.namespace("my_namespace").component("my_component").endpoint("my_endpoint") + +# Simple metrics +requests_total = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests" +) + +active_connections = endpoint.metrics.create_intgauge( + "active_connections", + "Active connections" +) + +latency = endpoint.metrics.create_histogram( + "latency_seconds", + "Request latency", + buckets=[0.001, 0.01, 0.1, 1.0, 10.0] +) +``` + +### Using Metrics + +```python +# Counters +requests_total.inc() +requests_total.inc_by(5) + +# Gauges +active_connections.set(42) +active_connections.inc() +active_connections.dec() + +# Histograms +latency.observe(0.023) # 23ms +``` + +### Vector Metrics with Labels + +```python +# Create vector metrics with label names +requests_by_model = endpoint.metrics.create_intcountervec( + "requests_by_model", + "Requests by model", + ["model_type", "model_size"] +) + +memory_by_gpu = endpoint.metrics.create_intgaugevec( + "gpu_memory_bytes", + "GPU memory by device", + ["gpu_id", "memory_type"] +) + +# Use with specific label values +requests_by_model.inc({"model_type": "llama", "model_size": "7b"}) +memory_by_gpu.set(8192, {"gpu_id": "0", "memory_type": "allocated"}) +``` + +### Advanced Features + +**Constant labels:** +```python +counter = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests", + [("region", "us-west"), ("env", "prod")] +) +``` + +**Metric introspection:** +```python +print(counter.name()) # "my_namespace_my_component_my_endpoint_requests_total" +print(counter.const_labels()) # {"dynamo_namespace": "my_namespace", ...} +print(gauge_vec.variable_labels()) # ["model_type", "model_size"] +``` + +**Update patterns:** + +Background thread updates: +```python +import threading +import time + +def update_loop(): + while True: + active_connections.set(compute_current_connections()) + time.sleep(2) + +threading.Thread(target=update_loop, daemon=True).start() +``` + +Callback-based updates (called before each `/metrics` scrape): +```python +def update_metrics(): + active_connections.set(compute_current_connections()) + +endpoint.metrics.register_callback(update_metrics) +``` + +### Examples + +Example scripts: [lib/bindings/python/examples/metrics/](../../lib/bindings/python/examples/metrics/) + +```bash +cd ~/dynamo/lib/bindings/python/examples/metrics +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_callback.py +``` + +--- + +## Related Documentation + +- [Metrics Overview](metrics.md) +- [Prometheus and Grafana Setup](prometheus-grafana.md) +- [Distributed Runtime Architecture](../design_docs/distributed_runtime.md) +- [Python Metrics Examples](../../lib/bindings/python/examples/metrics/) + diff --git a/docs/observability/metrics.md b/docs/observability/metrics.md index 7e2beb34c5..e947285545 100644 --- a/docs/observability/metrics.md +++ b/docs/observability/metrics.md @@ -3,27 +3,88 @@ SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All SPDX-License-Identifier: Apache-2.0 --> -# Dynamo MetricsRegistry +# Dynamo Metrics ## Overview -Dynamo provides built-in metrics capabilities through the `MetricsRegistry` trait, which is automatically available whenever you use the `DistributedRuntime` framework. 
This guide explains how to use metrics for observability and monitoring across all Dynamo components. +Dynamo provides built-in metrics capabilities through the Dynamo metrics API, which is automatically available whenever you use the `DistributedRuntime` framework. This document serves as a reference for all available metrics in Dynamo. -## Automatic Metrics +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). -Dynamo automatically exposes metrics with the `dynamo_` name prefixes. It also adds the following labels `dynamo_namespace`, `dynamo_component`, and `dynamo_endpoint` to indicate which component is providing the metric. +**For creating custom metrics**, see the [Metrics Developer Guide](metrics-developer-guide.md). -**Frontend Metrics**: When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name. These cover request handling, token processing, and latency measurements. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for the complete list of frontend metrics. +## Environment Variables -**Component Metrics**: The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework. These include request counts, processing times, byte transfers, and system uptime metrics. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for the complete list of component metrics. +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | +| `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | -**Specialized Component Metrics**: Components can also expose additional metrics specific to their functionality. For example, a `preprocessor` component exposes metrics with the `dynamo_preprocessor_*` prefix. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for details on specialized component metrics. +## Getting Started (Single GPU) -**Kubernetes Integration**: For comprehensive Kubernetes deployment and monitoring setup, see the [Kubernetes Metrics Guide](../kubernetes/observability/metrics.md). This includes Prometheus Operator setup, metrics collection configuration, and visualization in Grafana. +**Note:** This requires NATS and etcd running. For a complete setup with Prometheus and Grafana visualization, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). 
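+
+If NATS and etcd are not already running, they can be started with the infrastructure compose file (the same command used in the setup guide), run from the repository root:
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```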
-## Metrics Hierarchy +Launch a frontend and vLLM backend to test metrics: -The `MetricsRegistry` trait is implemented by `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed architecture: +```bash +$ python -m dynamo.frontend --http-port 8000 + +# Enable system metrics server +export DYN_SYSTEM_ENABLED=true +export DYN_SYSTEM_PORT=8081 + +$ python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ + --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +``` + +Wait for the vLLM worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the worker +curl -s localhost:8081/metrics | grep dynamo_component +``` + +## Exposed Metrics + +Dynamo exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All Dynamo-generated metrics use the `dynamo_*` prefix and include labels (`dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`) to identify the source component. + +**Example Prometheus Exposition Format text:** + +``` +# HELP dynamo_component_requests_total Total requests processed +# TYPE dynamo_component_requests_total counter +dynamo_component_requests_total{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 + +# HELP dynamo_component_request_duration_seconds Request processing time +# TYPE dynamo_component_request_duration_seconds histogram +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.005"} 10 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.01"} 15 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="+Inf"} 42 +dynamo_component_request_duration_seconds_sum{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 2.5 +dynamo_component_request_duration_seconds_count{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 +``` + +### Metric Categories + +Dynamo exposes several categories of metrics: + +- **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements +- **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime +- **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics +- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm:*`) + +## Runtime Hierarchy + +The Dynamo metrics API is available on `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed architecture: - `DistributedRuntime`: Global metrics across the entire runtime - `Namespace`: Metrics scoped to a specific dynamo_namespace @@ -32,65 +93,116 @@ The `MetricsRegistry` trait is implemented by `DistributedRuntime`, `Namespace`, This hierarchical structure allows you to create 
metrics at the appropriate level of granularity for your monitoring needs. +## Available Metrics -## Getting Started +### Backend Component Metrics -For a complete setup guide including Docker Compose configuration, Prometheus setup, and Grafana dashboards, see the [Getting Started section](prometheus-grafana.md#getting-started) in the Prometheus and Grafana guide. +The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: -The quick start includes: -- Docker Compose setup for Prometheus and Grafana -- Pre-configured dashboards and datasources -- Access URLs for all monitoring endpoints -- GPU targeting configuration +- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) +- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) +- `dynamo_component_request_duration_seconds`: Request processing time (histogram) +- `dynamo_component_requests_total`: Total requests processed (counter) +- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) +- `dynamo_component_system_uptime_seconds`: DistributedRuntime uptime (gauge) -## Implementation Examples +### KV Router Statistics (kvstats) -Examples of creating metrics at different hierarchy levels and using dynamic labels are included in this document below. +KV router statistics are automatically exposed by LLM workers and KV router components with the `dynamo_component_kvstats_*` prefix. These metrics provide insights into GPU memory usage and cache efficiency: -### Grafana Dashboards +- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) +- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) +- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) +- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) -Use dashboards in `deploy/metrics/grafana_dashboards/`: -- `grafana-dynamo-dashboard.json`: General Dynamo dashboard -- `grafana-dcgm-metrics.json`: DCGM GPU metrics dashboard +These metrics are published by: +- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers +- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions -## Metrics Visualization Architecture +### Specialized Component Metrics -### Service Topology +Some components expose additional metrics specific to their functionality: -The metrics system follows this architecture for collecting and visualizing metrics: +- `dynamo_preprocessor_*`: Metrics specific to preprocessor components -```mermaid -graph TD - BROWSER[Browser] -->|:3001| GRAFANA[Grafana :3001] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` +### Frontend Metrics + +When using Dynamo HTTP Frontend (`--framework VLLM` 
or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name: + +- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) +- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) +- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) +- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) +- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) +- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) +- `dynamo_frontend_requests_total`: Total LLM requests (counter) +- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) + +**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. + +#### Model Configuration Metrics + +The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system: -### Grafana Dashboard +**Runtime Config Metrics (from ModelRuntimeConfig):** +These metrics come from the runtime configuration provided by worker backends during registration. -The metrics system includes a pre-configured Grafana dashboard for visualizing service metrics: +- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) -![Grafana Dynamo Dashboard](./grafana-dynamo-composite.png) +**MDC Metrics (from ModelDeploymentCard):** +These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances. -## Detailed Setup Guide +- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) +- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) +- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) -For complete setup instructions including Docker Compose, Prometheus configuration, and Grafana dashboards, see: +**Worker Management Metrics:** +- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge) -```{toctree} -:hidden: +### Request Processing Flow -prometheus-grafana +This section explains the distinction between two key metrics used to track request processing: + +1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished +2. 
**HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) + +**Example Request Flow:** ``` +curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello let's talk about LLMs", + "stream": false, + "max_tokens": 1000 +}' +``` + +**Timeline:** +``` +Timeline: 0, 1, ... +Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) + │request start │received │ + | | | + │ ├──> start prefill ──> first token ──> |last token + │ │ (not impl) | | + ├─────actual HTTP queue¹ ──────────┘ │ | + │ │ │ + ├─────implemented HTTP queue ─────────────────────────────┘ | + │ │ + └─────────────────────────────────── Inflight ────────────────────────────┘ +``` + +**Concurrency Example:** +Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: +- All 10 requests will be counted as inflight (from start until complete response) +- 7 requests will be in HTTP queue most of the time +- 3 requests will be actively processed (between first token and last token) -- [Prometheus and Grafana Setup Guide](prometheus-grafana.md) +**Key Differences:** +- **Inflight**: Measures total request lifetime including processing time +- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) +- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) ## Related Documentation diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index 6c6bcec60c..8789f888e4 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -1,254 +1,134 @@ # Metrics Visualization with Prometheus and Grafana -This directory contains configuration for visualizing metrics from the metrics aggregation service using Prometheus and Grafana. - -> [!NOTE] -> For detailed information about Dynamo's metrics system, including hierarchical metrics, automatic labeling, and usage examples, see the [Metrics Guide](./metrics.md). - ## Overview -### Components - -- **Prometheus Server**: Collects and stores metrics from Dynamo services and other components. -- **Grafana**: Provides dashboards by querying the Prometheus Server. - -### Topology - -Default Service Relationship Diagram: -```mermaid -graph TD - BROWSER[Browser] -->|:3001| GRAFANA[Grafana :3001] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` - -The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. - -As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build containers with `--framework VLLM` or `--framework TRTLLM`. 
- -### Available Metrics - -#### Backend Component Metrics - -The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: - -- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) -- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) -- `dynamo_component_request_duration_seconds`: Request processing time (histogram) -- `dynamo_component_requests_total`: Total requests processed (counter) -- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) -- `dynamo_component_system_uptime_seconds`: DistributedRuntime uptime (gauge) - -#### KV Router Statistics (kvstats) - -KV router statistics are automatically exposed by LLM workers and KV router components with the `dynamo_component_kvstats_*` prefix. These metrics provide insights into GPU memory usage and cache efficiency: - -- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) -- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) -- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) -- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) - -These metrics are published by: -- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers -- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions - -#### Specialized Component Metrics - -Some components expose additional metrics specific to their functionality: - -- `dynamo_preprocessor_*`: Metrics specific to preprocessor components - -#### Frontend Metrics +This guide shows how to set up Prometheus and Grafana for visualizing Dynamo metrics on a single machine for demo purposes. -When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name: +![Grafana Dynamo Dashboard](./grafana-dynamo-composite.png) -- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) -- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) -- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) -- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) -- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) -- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) -- `dynamo_frontend_requests_total`: Total LLM requests (counter) -- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) +**Components:** +- **Prometheus Server** - Collects and stores metrics from Dynamo services +- **Grafana** - Provides dashboards by querying the Prometheus Server -**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. +**For metrics reference**, see [Metrics Documentation](metrics.md). 
-##### Model Configuration Metrics +## Environment Variables -The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system: +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | +| `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | -**Runtime Config Metrics (from ModelRuntimeConfig):** -These metrics come from the runtime configuration provided by worker backends during registration. +## Getting Started (Single GPU) -- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) -- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) -- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) - -**MDC Metrics (from ModelDeploymentCard):** -These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances. +### Prerequisites -- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) -- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) -- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) +Install these on your machine: -**Worker Management Metrics:** -- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge) +- [Docker](https://docs.docker.com/get-docker/) +- [Docker Compose](https://docs.docker.com/compose/install/) -#### Request Processing Flow +### Start the Observability Stack -This section explains the distinction between two key metrics used to track request processing: +From the Dynamo root directory: -1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished -2. **HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) +```bash +# Start infrastructure (NATS, etcd) +docker compose -f deploy/docker-compose.yml up -d -**Example Request Flow:** -``` -curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen3-0.6B", - "prompt": "Hello let's talk about LLMs", - "stream": false, - "max_tokens": 1000 -}' +# Then start observability stack (Prometheus, Grafana, Tempo, DCGM GPU exporter, NATS exporter) +docker compose -f deploy/docker-observability.yml up -d ``` -**Timeline:** -``` -Timeline: 0, 1, ... 
-Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) - │request start │received │ - | | | - │ ├──> start prefill ──> first token ──> |last token - │ │ (not impl) | | - ├─────actual HTTP queue¹ ──────────┘ │ | - │ │ │ - ├─────implemented HTTP queue ─────────────────────────────┘ | - │ │ - └─────────────────────────────────── Inflight ────────────────────────────┘ -``` +### Start Dynamo Components -**Concurrency Example:** -Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: -- All 10 requests will be counted as inflight (from start until complete response) -- 7 requests will be in HTTP queue most of the time -- 3 requests will be actively processed (between first token and last token) +Start frontend and worker (a simple single GPU example): -**Testing Setup:** -Try launching a frontend and a Mocker backend that allows 3 concurrent requests: ```bash -$ python -m dynamo.frontend --http-port 8000 -$ python -m dynamo.mocker --model-path Qwen/Qwen3-0.6B --max-num-seqs 3 -# Launch your 10 concurrent clients here -# Then check the queued_requests and inflight_requests metrics from the frontend: -$ curl -s localhost:8000/metrics|grep -v '^#'|grep -E 'queue|inflight' -dynamo_frontend_queued_requests{model="qwen/qwen3-0.6b"} 7 -dynamo_frontend_inflight_requests{model="qwen/qwen3-0.6b"} 10 -``` +# Start frontend in one process +python -m dynamo.frontend --http-port 8000 & -**Real setup using vLLM (instead of Mocker):** -```bash -$ python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ - --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +# Start vLLM worker with metrics enabled in another process +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ + python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager ``` -**Key Differences:** -- **Inflight**: Measures total request lifetime including processing time -- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) -- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) - -### Required Files - -The following configuration files are located in the `deploy/metrics/` directory: -- [docker-compose.yml](../../deploy/docker-compose.yml): Defines the Prometheus and Grafana services -- [prometheus.yml](../../deploy/metrics/prometheus.yml): Contains Prometheus scraping configuration -- [grafana-datasources.yml](../../deploy/metrics/grafana-datasources.yml): Contains Grafana datasource configuration -- [grafana_dashboards/grafana-dashboard-providers.yml](../../deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration -- [grafana_dashboards/grafana-dynamo-dashboard.json](../../deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics. -- [grafana_dashboards/grafana-dcgm-metrics.json](../../deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics -- [grafana_dashboards/grafana-kvbm-dashboard.json](../../deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics - -### Metric Name Constants - -The [prometheus_names.rs](../../lib/runtime/src/metrics/prometheus_names.rs) module provides centralized Prometheus metric name constants and sanitization utilities for the Dynamo metrics system. 
This module ensures consistency across all components and prevents metric name duplication. - -#### Key Features - -- **Centralized Constants**: All Prometheus metric names are defined as constants to avoid duplication and typos -- **Automatic Sanitization**: Functions to sanitize metric and label names according to Prometheus naming rules -- **Component Organization**: Metric names are organized by component (frontend, work_handler, nats_client, etc.) -- **Validation Arrays**: Arrays of metric names for iteration and validation purposes - -#### Metric Name Prefixes - -- `dynamo_component_*`: Core component metrics (requests, latency, bytes, etc.) -- `dynamo_frontend_*`: Frontend service metrics (LLM HTTP service) -- `nats_client_*`: NATS client connection and message metrics -- `nats_service_*`: NATS service statistics metrics -- `kvstats_*`: KV cache statistics from LLM workers +After the workers are running, send a few test requests to populate metrics in the system: -#### Sanitization Functions - -The module provides functions to ensure metric and label names comply with Prometheus naming conventions: - -- `sanitize_prometheus_name()`: Sanitizes metric names (allows colons and `__`) -- `sanitize_prometheus_label()`: Sanitizes label names (no colons, no `__` prefix) -- `build_component_metric_name()`: Builds full component metric names with proper prefixing +```bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello"}], + "max_completion_tokens": 100 + }' +``` -This centralized approach ensures all Dynamo components use consistent, valid Prometheus metric names without manual coordination. +After sending a few requests, the Prometheus Exposition Format text metrics are available at: +- Frontend: `http://localhost:8000/metrics` +- Backend worker: `http://localhost:8081/metrics` -## Getting Started +### Access Web Interfaces -### Prerequisites +Once Dynamo components are running: -1. Make sure Docker and Docker Compose are installed on your system +1. Open **Grafana** at `http://localhost:3000` (username: `dynamo`, password: `dynamo`) +2. Click on **Dashboards** in the left sidebar +3. Select **Dynamo Dashboard** to view metrics and traces -### Quick Start +Other interfaces: +- **Prometheus**: `http://localhost:9090` +- **Tempo** (tracing): Accessible through Grafana's Explore view. See [Tracing Guide](tracing.md) for details. -1. Start Dynamo dependencies. Assume you're at the root dynamo path: +**Note:** If accessing from another machine, replace `localhost` with the machine's hostname or IP address, and ensure firewall rules allow access to these ports (3000, 9090). 
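+
+If opening those ports is not practical, one alternative is SSH port forwarding from your workstation (a sketch; replace `user@dynamo-host` with your own host):
+
+```bash
+ssh -N -L 3000:localhost:3000 -L 9090:localhost:9090 user@dynamo-host
+```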
- ```bash - # Start the basic services (etcd & natsd), along with Prometheus and Grafana - docker compose -f deploy/docker-compose.yml --profile metrics up -d +--- - # Minimum components for Dynamo (will not have Prometheus and Grafana): etcd/nats/dcgm-exporter - docker compose -f deploy/docker-compose.yml up -d - ``` +## Topology - Optional: To target specific GPU(s), export the variable below before running Docker Compose - ```bash - export CUDA_VISIBLE_DEVICES=0,2 - ``` +Default Service Relationship Diagram: +```mermaid +graph TD + BROWSER[Browser] -->|:3000| GRAFANA[Grafana :3000] + subgraph DockerComposeNetwork [Network inside Docker Compose] + NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] + PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] + PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] + PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP + PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] + PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] + DYNAMOFE --> DYNAMOBACKEND + GRAFANA -->|:9090/query API| PROMETHEUS + end +``` -2. Web servers started. The ones that end in /metrics are in Prometheus format: - - Grafana: `http://localhost:3001` (default login: dynamo/dynamo) - - Prometheus Server: `http://localhost:9090` - - NATS Server: `http://localhost:8222` (monitoring endpoints: /varz, /healthz, etc.) - - NATS Prometheus Exporter: `http://localhost:7777/metrics` - - etcd Server: `http://localhost:2379/metrics` - - DCGM Exporter: `http://localhost:9401/metrics` +The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. +### Required Files - - Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics. 
+The following configuration files are located in the `deploy/observability/` directory: +- [docker-compose.yml](../../deploy/docker-compose.yml): Defines NATS and etcd services +- [docker-observability.yml](../../deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters +- [prometheus.yml](../../deploy/observability/prometheus.yml): Contains Prometheus scraping configuration +- [grafana-datasources.yml](../../deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration +- [grafana_dashboards/grafana-dashboard-providers.yml](../../deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration +- [grafana_dashboards/grafana-dynamo-dashboard.json](../../deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics +- [grafana_dashboards/grafana-dcgm-metrics.json](../../deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics +- [grafana_dashboards/grafana-kvbm-dashboard.json](../../deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics ### Configuration #### Prometheus -The Prometheus configuration is specified in [prometheus.yml](../../deploy/metrics/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint. +The Prometheus configuration is specified in [prometheus.yml](../../deploy/observability/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint. Please be aware that you might need to modify the target settings to align with your specific host configuration and network environment. -After making changes to prometheus.yml, it is necessary to reload the configuration using the command below. Simply sending a kill -HUP signal will not suffice due to the caching of the volume that contains the prometheus.yml file. +After making changes to prometheus.yml, restart the Prometheus service: -``` -docker compose -f deploy/docker-compose.yml up prometheus -d --force-recreate +```bash +docker compose -f deploy/docker-observability.yml restart prometheus ``` #### Grafana @@ -256,237 +136,32 @@ docker compose -f deploy/docker-compose.yml up prometheus -d --force-recreate Grafana is pre-configured with: - Prometheus datasource - Sample dashboard for visualizing service metrics -![grafana image](./grafana-dynamo-composite.png) ### Troubleshooting 1. Verify services are running: ```bash - docker compose ps + docker compose -f deploy/docker-observability.yml ps ``` 2. Check logs: ```bash - docker compose logs prometheus - docker compose logs grafana + docker compose -f deploy/docker-observability.yml logs prometheus + docker compose -f deploy/docker-observability.yml logs grafana ``` 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. 
-## Developer Guide - -### Creating Metrics at Different Hierarchy Levels - -#### Runtime-Level Metrics - -```rust -use dynamo_runtime::DistributedRuntime; - -let runtime = DistributedRuntime::new()?; -let namespace = runtime.namespace("my_namespace")?; -let component = namespace.component("my_component")?; -let endpoint = component.endpoint("my_endpoint")?; - -// Create endpoint-level counters (this is a Prometheus Counter type) -let requests_total = endpoint.metrics().create_counter( - "requests_total", - "Total requests across all namespaces", - &[] -)?; - -let active_connections = endpoint.metrics().create_gauge( - "active_connections", - "Number of active client connections", - &[] -)?; -``` - -#### Namespace-Level Metrics - -```rust -let namespace = runtime.namespace("my_model")?; - -// Namespace-scoped metrics -let model_requests = namespace.metrics().create_counter( - "model_requests", - "Requests for this specific model", - &[] -)?; - -let model_latency = namespace.metrics().create_histogram( - "model_latency_seconds", - "Model inference latency", - &[], - Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) -)?; -``` - -#### Component-Level Metrics - -```rust -let component = namespace.component("backend")?; - -// Component-specific metrics -let backend_requests = component.metrics().create_counter( - "backend_requests", - "Requests handled by this backend component", - &[] -)?; - -let gpu_memory_usage = component.metrics().create_gauge( - "gpu_memory_bytes", - "GPU memory usage in bytes", - &[] -)?; -``` - -#### Endpoint-Level Metrics - -```rust -let endpoint = component.endpoint("generate")?; - -// Endpoint-specific metrics -let generate_requests = endpoint.metrics().create_counter( - "generate_requests", - "Generate endpoint requests", - &[] -)?; - -let generate_latency = endpoint.metrics().create_histogram( - "generate_latency_seconds", - "Generate endpoint latency", - &[], - Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) -)?; -``` - -### Creating Vector Metrics with Dynamic Labels - -Use vector metrics when you need to track metrics with different label values: - -```rust -// Counter with labels -let requests_by_model = endpoint.metrics().create_countervec( - "requests_by_model", - "Requests by model type", - &["model_type", "model_size"], - &[] // no constant labels -)?; - -// Increment with specific labels -requests_by_model.with_label_values(&["llama", "7b"]).inc(); -requests_by_model.with_label_values(&["gpt", "13b"]).inc(); - -// Gauge with labels -let memory_by_gpu = component.metrics().create_gaugevec( - "gpu_memory_bytes", - "GPU memory usage by device", - &["gpu_id", "memory_type"], - &[] // no constant labels -)?; - -memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); -memory_by_gpu.with_label_values(&["0", "cached"]).set(4096.0); -``` - -### Creating Histograms - -Histograms are useful for measuring distributions of values like latency: - -```rust -let latency_histogram = endpoint.metrics().create_histogram( - "request_latency_seconds", - "Request latency distribution", - &[], - Some(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]) -)?; - -// Record latency values -latency_histogram.observe(0.023); // 23ms -latency_histogram.observe(0.156); // 156ms -``` - -### Transitioning from Plain Prometheus - -If you're currently using plain Prometheus metrics, transitioning to Dynamo's `MetricsRegistry` is straightforward: - -#### Before (Plain Prometheus) - -```rust -use prometheus::{Counter, Opts, Registry}; - -// Create a registry to hold metrics -let registry = 
Registry::new(); -let counter_opts = Opts::new("my_counter", "My custom counter"); -let counter = Counter::with_opts(counter_opts).unwrap(); -registry.register(Box::new(counter.clone())).unwrap(); - -// Use the counter -counter.inc(); - -// To expose metrics, you'd need to set up an HTTP server manually -// and implement the /metrics endpoint yourself -``` - -#### After (Dynamo MetricsRegistry) - -```rust -let counter = endpoint.metrics().create_counter( - "my_counter", - "My custom counter", - &[] -)?; - -counter.inc(); -``` - -**Note:** The metric is automatically registered when created via the endpoint's `metrics().create_counter()` factory method. - -**Benefits of Dynamo's approach:** -- **Automatic registration**: Metrics created via endpoint's `metrics().create_*()` factory methods are automatically registered with the system -- Automatic labeling with namespace, component, and endpoint information -- Consistent metric naming with `dynamo_` prefix -- Built-in HTTP metrics endpoint when enabled with `DYN_SYSTEM_ENABLED=true` -- Hierarchical metric organization - -### Advanced Features - -#### Custom Buckets for Histograms - -```rust -// Define custom buckets for your use case -let custom_buckets = vec![0.001, 0.01, 0.1, 1.0, 10.0]; -let latency = endpoint.metrics().create_histogram( - "api_latency_seconds", - "API latency in seconds", - &[], - Some(custom_buckets) -)?; -``` - -#### Metric Aggregation - -```rust -// Aggregate metrics across multiple endpoints -let requests_total = namespace.metrics().create_counter( - "requests_total", - "Total requests across all endpoints", - &[] -)?; -``` - - -## Troubleshooting - -1. Verify services are running: +4. If you encounter issues with stale data or configuration, stop services and wipe volumes: ```bash - docker compose ps + docker compose -f deploy/docker-observability.yml down -v + docker compose -f deploy/docker-observability.yml up -d ``` -2. Check logs: - ```bash - docker compose logs prometheus - docker compose logs grafana - ``` + **Note:** The `-v` flag removes named volumes (grafana-data, tempo-data), which will reset dashboards and stored metrics. -3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. +## Developer Guide + +For detailed information on creating custom metrics in Dynamo components, see: + +- [Metrics Developer Guide](metrics-developer-guide.md) diff --git a/docs/observability/tracing.md b/docs/observability/tracing.md index 4ca5cc10fd..feae4d3f29 100644 --- a/docs/observability/tracing.md +++ b/docs/observability/tracing.md @@ -5,86 +5,28 @@ SPDX-License-Identifier: Apache-2.0 # Distributed Tracing with Tempo -This guide explains how to set up and view distributed traces in Grafana Tempo for Dynamo workloads. - -> **Note:** This guide covers local single-instance deployments using Docker Compose for demonstration purposes. For distributed Kubernetes deployments, see the [Kubernetes Deployment](#kubernetes-deployment) section. - -> **💡 Tip:** For local development, use the unified observability stack at `../../deploy/docker-observability.yml`, which includes Tempo, Prometheus, Grafana, and metric exporters in one convenient stack. - ## Overview -Dynamo supports OpenTelemetry-based distributed tracing, allowing you to visualize request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. 
+Dynamo supports OpenTelemetry-based distributed tracing for visualizing request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. -## Prerequisites +**Requirements:** Set `DYN_LOGGING_JSONL=true` and `OTEL_EXPORT_ENABLED=true` to export traces to Tempo. -- Docker and Docker Compose (for local deployment) -- Kubernetes cluster with kubectl access (for Kubernetes deployment) -- Dynamo runtime with tracing support - -## Environment Variables +This guide covers single GPU demo setup using Docker Compose. For Kubernetes deployments, see [Kubernetes Deployment](#kubernetes-deployment). -Dynamo's tracing is configured via environment variables. For complete logging documentation, see [logging.md](./logging.md). +**Note:** This section has overlap with [Logging of OpenTelemetry Tracing](logging.md) since OpenTelemetry has aspects of both logging and tracing. The tracing approach documented here is for persistent trace visualization and analysis. For short debugging sessions examining trace context directly in logs, see the [Logging](logging.md) guide. -### Required Environment Variables +## Environment Variables -| Variable | Description | Default | Example Value | -|----------|-------------|---------|---------------| +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| | `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `false` | `true` | -| `DYN_LOG` | Log level (info or debug). Use debug to see detailed traces. | `info` | `debug` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | disabled | `1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` (docker) | -| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend`, `dynamo-worker-prefill`, `dynamo-worker-decode` | - -### Tracing Behavior - -- **When `OTEL_EXPORT_ENABLED` is NOT set**: Tracing is disabled. Traces are generated locally for trace logging, but not exported to any backend. -- **When `OTEL_EXPORT_ENABLED=1`**: Traces are exported via OTLP to the endpoint specified by `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`. - - If `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` is NOT set, it defaults to `http://localhost:4317` - - If `OTEL_SERVICE_NAME` is NOT set, it defaults to `dynamo` - -**Note:** When `OTEL_EXPORT_ENABLED=1`, logging initialization is deferred until the runtime is available (required by the OTEL exporter). This means some early logs will be dropped. This will be fixed in a future release. - -### Log Levels and Visibility - -Traces are exported to Tempo/Grafana regardless of log level. Set `DYN_LOG` only if you want to view trace details in console logs for debugging. 
- -JSONL logs are written to **stderr**: - -- **INFO (default)**: Initialization, errors, high-level operations -- **DEBUG**: Detailed traces with trace IDs, span IDs, and routing decisions - -Example: -```bash -export DYN_LOGGING_JSONL=true -export DYN_LOG=debug -python -m dynamo.frontend --http-port 8000 -``` - -### Example Configuration +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` | +| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend` | -```bash -# Enable JSONL logging and tracing -export DYN_LOGGING_JSONL=true - -# Enable trace export to Tempo -export OTEL_EXPORT_ENABLED=1 - -# Set the Tempo endpoint (docker-compose network). Note that if this is not specified, it will default to http://localhost:4317 -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo-host-name-here:4317 - -# Set service name to identify this component -export OTEL_SERVICE_NAME=dynamo-frontend -``` - ---- - -## Local Deployment with Docker Compose - -> **Note:** The following Docker Compose commands are for demonstrating single-instance local deployments, not distributed Kubernetes environments. +## Getting Started (Single GPU) -### 1. Start the Unified Observability Stack - -From the `deploy` directory, start the unified observability stack: +### 1. Start Observability Stack ```bash cd deploy @@ -111,11 +53,11 @@ Configure Dynamo components to export traces: ```bash # Enable JSONL logging and tracing export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 +export OTEL_EXPORT_ENABLED=true export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 ``` -### 3a. Quick Start: Single GPU Aggregated Deployment +### 3. Start Dynamo Components (Single GPU) For a simple single-GPU deployment, start the frontend and a single vLLM worker: @@ -133,7 +75,7 @@ wait This runs both prefill and decode on the same GPU, providing a simpler setup for testing tracing. -### 3b. Run vLLM Disaggregated Deployment (2 GPUs) +### Alternative: Disaggregated Deployment (2 GPUs) Run the vLLM disaggregated script with tracing enabled: @@ -154,47 +96,45 @@ trap 'echo Cleaning up...; kill 0' EXIT # Enable tracing export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 +export OTEL_EXPORT_ENABLED=true export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 # Run frontend export OTEL_SERVICE_NAME=dynamo-frontend python -m dynamo.frontend --router-mode kv --http-port=8000 & -# Run decode worker +# Run decode worker, make sure to wait for start up export OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & -# Run prefill worker +# Run prefill worker, make sure to wait for start up export OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ --model Qwen/Qwen3-0.6B \ --enforce-eager \ --is-prefill-worker & - -wait ``` For disaggregated deployments, this separates prefill and decode onto different GPUs for better resource utilization. ### 4. Generate Traces -Send requests to the frontend to generate traces (works for both aggregated and disaggregated deployments): +Send requests to the frontend to generate traces (works for both aggregated and disaggregated deployments). 
**Note the `x-request-id` header**, which allows you to easily search for and correlate this specific trace in Grafana: ```bash -curl -d '{ +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ "model": "Qwen/Qwen3-0.6B", "max_completion_tokens": 100, "messages": [ {"role": "user", "content": "What is the capital of France?"} ] }' \ --H 'Content-Type: application/json' \ --H 'x-request-id: test-trace-001' \ http://localhost:8000/v1/chat/completions ``` -### 6. View Traces in Grafana Tempo +### 5. View Traces in Grafana Tempo 1. Open Grafana at `http://localhost:3000` 2. Login with username `dynamo` and password `dynamo` @@ -213,7 +153,7 @@ Below is an example of what a trace looks like in Grafana Tempo: ![Trace Example](trace.png) -### 7. Stop Services +### 6. Stop Services When done, stop the observability stack: @@ -243,7 +183,7 @@ spec: - name: DYN_LOGGING_JSONL value: "true" - name: OTEL_EXPORT_ENABLED - value: "1" + value: "true" - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT value: "http://tempo.observability.svc.cluster.local:4317" diff --git a/lib/bindings/python/examples/metrics/README.md b/lib/bindings/python/examples/metrics/README.md index 9e0b810773..9893239ae0 100644 --- a/lib/bindings/python/examples/metrics/README.md +++ b/lib/bindings/python/examples/metrics/README.md @@ -1,425 +1,25 @@ -# Dynamo MetricsRegistry for Python +# Python Metrics Examples -Python MetricsRegistry allows you to create and manage Prometheus metrics from Python: +Example scripts demonstrating how to create and use Prometheus metrics in Python using the Dynamo metrics API. -- **Metric Types**: Counter, IntCounter, Gauge, IntGauge, Histogram, and their Vec variants (CounterVec, IntCounterVec, GaugeVec, IntGaugeVec) -- **Metric Introspection**: Access metric names, constant labels, and variable label names -- **Automatic Registration**: Metrics are automatically registered with the component hierarchy (namespace/component/endpoint) and available on the HTTP system status server -- **Optional Callback Support**: Register Python callbacks to update metrics before scraping +## Documentation -Example: -```python -from dynamo.runtime import DistributedRuntime +See the **[Metrics Developer Guide - Python Section](../../../../docs/observability/metrics-developer-guide.md#python-metrics-api)** for complete documentation. -async def main(): - drt = DistributedRuntime() - endpoint = drt.namespace("ns").component("comp").endpoint("ep") - - # Create metrics - counter = endpoint.metrics.create_intcounter("requests_total", "Total requests") - gauge_vec = endpoint.metrics.create_intgaugevec( - "active_connections", - "Active connections by status", - ["status"], # variable labels - [("region", "us-west")] # constant labels - ) - - # Introspect metrics - print(counter.name()) # "ns_comp_ep_requests_total" - print(counter.const_labels()) # {"dynamo_namespace": "ns", ...} - print(gauge_vec.variable_labels()) # ["status"] - - # Use metrics - counter.inc() - gauge_vec.set(5, {"status": "active"}) -``` - -## Python-Rust Metrics Integration - -This directory demonstrates two methods for passing metrics between Python and Rust in the Dynamo runtime. - -### Method 1: ForwardPassMetrics Pub/Sub via NATS (Legacy method for passing metrics) - -Python maintains its own metrics dictionary, serializes it, and publishes to NATS. Rust subscribes to NATS, deserializes the metrics, and updates Prometheus gauges. - -**Communication pattern**: Unidirectional (Python → NATS → Rust). 
Python publishes metrics; no feedback from Rust to Python. - -**Example**: Used by `WorkerMetricsPublisher` in production code - -```python -from dynamo.llm import WorkerMetricsPublisher, ForwardPassMetrics - -# Create publisher -publisher = WorkerMetricsPublisher() -await publisher.create_endpoint(component, metrics_labels) - -# Python maintains its own metrics dict -metrics_dict = { - "num_running_reqs": 5, - "num_waiting_reqs": 10, - "gpu_cache_usage": 0.75, -} - -# Serialize and publish to NATS -metrics = ForwardPassMetrics(metrics_dict) -publisher.publish(metrics) - -# Rust subscribes to NATS, deserializes, and updates Prometheus -``` - -### Adding/Changing Metrics in Method 1 - -When you need to add or modify metrics in Method 1 (ForwardPassMetrics Pub/Sub via NATS), you must update **multiple files**: - -1. **`lib/llm/src/kv_router/protocols.rs`** - Add field to struct (WorkerStats is part of ForwardPassMetrics): - ```rust - pub struct WorkerStats { - pub request_active_slots: u64, - pub request_total_slots: u64, - pub num_requests_waiting: u64, - pub new_metric_field: u64, // ADD THIS - } - ``` - -2. **`lib/llm/src/kv_router/publisher.rs`** - Manually create Prometheus gauge using DRT: - ```rust - fn new(component: &Component) -> Result { - use dynamo_runtime::metrics::MetricsRegistry; - - // ... existing gauges ... - - // Manually create and register new Prometheus gauge - let new_metric_gauge = component.metrics().create_gauge( - "new_metric_name", - "Description of new metric", - &[], // labels - )?; - - // Store in struct - Ok(KvStatsPrometheusGauges { - kv_active_blocks_gauge, - kv_total_blocks_gauge, - gpu_cache_usage_gauge, - gpu_prefix_cache_hit_rate_gauge, - new_metric_gauge, // ADD THIS - }) - } - ``` - -3. **`lib/llm/src/kv_router/publisher.rs`** - Update gauge in `update_from_kvstats()`: - ```rust - fn update_from_kvstats(&self, kv_stats: &KvStats) { - // ... existing updates ... - self.new_metric_gauge.set(worker_stats.new_metric_field as f64); - } - ``` - -4. **`components/src/dynamo/sglang/publisher.py`** - Update Python code to compute new metric: - ```python - def collect_metrics(): - worker_stats = WorkerStats( - request_active_slots=..., - new_metric_field=compute_new_metric(), # ADD THIS - ) - ``` - -**Result**: Changes require touching 3-4 files across Rust and Python codebases. - -### Method 2: Dynamo MetricsRegistry in Python - -Python creates typed metric objects using `endpoint.metrics.create_*()` methods, which automatically register with the endpoint. Python updates values through these objects with methods that have type hints (via `.pyi` files). Rust creates the underlying Prometheus metrics and calls Python callbacks before scraping. - -**Communication pattern**: Currently unidirectional (Python → Rust for updates, Rust → Python for callback invocation). Could be extended to bidirectional communication in the future (e.g., Rust notifying Python of scrape events, configuration changes) without major architectural changes. - -**Key advantage:** No Rust code modifications needed - metrics are defined and updated entirely in Python. 
- -This method supports two update patterns: - -#### Example A: Background Thread Updates (server_with_loop.py) - -Update metrics continuously from a background thread, independent of scraping: - -```python -# Create metric objects (automatically registered) -# Note: Prometheus prefixes these with "dynamo_component_", so they appear as: -# - dynamo_component_request_total_slots -# - dynamo_component_gpu_cache_usage_percent -request_slots: IntGauge = endpoint.metrics.create_intgauge( - "request_total_slots", "Total request slots available" -) -gpu_usage: Gauge = endpoint.metrics.create_gauge( - "gpu_cache_usage_percent", "GPU cache usage percentage" -) - -# Background thread continuously updates metrics -def update_metrics_in_loop(): - count = 0 - while True: - count += 1 - request_slots.set(1024 + count) - gpu_usage.set(0.01 + (count * 0.01)) - time.sleep(2) - -updater = threading.Thread(target=update_metrics_in_loop, daemon=True) -updater.start() -``` - -#### Example B: Callback-based Updates (server_with_callback.py) - -Register a callback that updates metrics on-demand when Prometheus scrapes the `/metrics` endpoint: - -```python -# Create metric objects (automatically registered) -# Note: Prometheus prefixes these with "dynamo_component_", so they appear as: -# - dynamo_component_request_total_slots -# - dynamo_component_gpu_cache_usage_percent -request_slots: IntGauge = endpoint.metrics.create_intgauge( - "request_total_slots", "Total request slots available" -) -gpu_usage: Gauge = endpoint.metrics.create_gauge( - "gpu_cache_usage_percent", "GPU cache usage percentage" -) - -# Register callback for dynamic updates before scraping -def update_metrics(): - request_slots.set(compute_current_slots()) - gpu_usage.set(get_gpu_usage()) - -endpoint.metrics.register_callback(update_metrics) -``` - -Both examples support vector metrics with labels: - -```python -# Create vector metrics with labels -worker_requests: IntGaugeVec = endpoint.metrics.create_intgaugevec( - "worker_active_requests", - "Active requests per worker", - ["worker_id", "model"] -) - -# Update vector metrics with specific label values -worker_requests.set(5, {"worker_id": "worker_1", "model": "llama-3"}) -worker_requests.set(3, {"worker_id": "worker_2", "model": "llama-3"}) -``` - -#### Available Metric Types - -Method 2 supports all standard Prometheus metric types: - -- **Gauges**: `Gauge` (float), `IntGauge` (integer) -- **GaugeVec**: `GaugeVec` (float with labels), `IntGaugeVec` (integer with labels) -- **Counters**: `Counter` (float), `IntCounter` (integer) -- **CounterVec**: `CounterVec` (float with labels), `IntCounterVec` (integer with labels) -- **Histograms**: `Histogram` - -All metrics are imported from `dynamo.prometheus_metrics`. - -#### Adding/Changing Metrics in Method 2 - -When you need to add or modify metrics in Method 2 (Dynamic Registration), you only update **Python code**: - -1. **Create new metric** - Just add one line in Python (automatically registered): - ```python - new_metric: IntGauge = endpoint.metrics.create_intgauge( - "new_metric_name", "Description of the metric" - ) - ``` - -2. **Update in callback** - Add update logic: - ```python - def update_metrics(): - request_slots.set(compute_slots()) - gpu_usage.set(compute_gpu_usage()) - new_metric.set(compute_new_metric()) # ADD THIS - ``` - -3. 
**For vector metrics with labels** - Create with label names, update with label values: - ```python - # Create vector metric - new_vec: IntGaugeVec = endpoint.metrics.create_intgaugevec( - "new_metric_vec", "Description", ["label1", "label2"] - ) - - # Update with specific label values - new_vec.set(100, {"label1": "value1", "label2": "value2"}) - ``` - -**Result**: Changes only require modifying Python code. No Rust changes needed. Metrics are automatically created and registered with Prometheus by the Rust runtime when you call `create_*()`. - -#### Type-Hinted Methods - -Dynamic Registration provides type hints (via `.pyi` stub files) for typed metric classes: - -- **Gauges** use `.set()`, `.get()`, `.inc()`, `.dec()`, `.add()`, `.sub()` -- **Counters** use `.inc()`, `.inc_by()`, `.get()` (counters only increase) -- **Histograms** use `.observe()` -- **Vec metrics** take a `labels: Dict[str, str]` parameter for operations - -### Architecture Diagrams - -#### Component Architecture - -##### Method 1: ForwardPassMetrics Pub/Sub via NATS - Component View - -```mermaid -graph TB - subgraph "Python Layer" - PY[Python Application
components/src/dynamo/sglang/main.py] - style PY fill:#3776ab,color:#fff - end - - subgraph "Python/Rust Interface (PyO3)" - WMPB[WorkerMetricsPublisher Bindings
bindings/python/rust/llm/kv.rs] - FPM[ForwardPassMetrics Struct
bindings/python/rust/llm/kv.rs] - style WMPB fill:#f4a261,color:#000 - style FPM fill:#f4a261,color:#000 - end - - subgraph "Rust Core" - subgraph "Worker Process Components" - WMP[WorkerMetricsPublisher
llm/src/kv_router/publisher.rs] - WATCH[Watch Channel
tokio::sync::watch] - PROM1[Local Prometheus Gauges
prometheus::Gauge] - end - - subgraph "NATS Infrastructure" - NATS[NATS Server
KV_METRICS_SUBJECT] - end - - subgraph "Other Consumers (e.g., KvWorkerMonitor)" - SUB[NATS Subscriber
component/namespace.rs] - end - - subgraph "System Status Servers" - SS[System Status Server
runtime/src/system_status_server.rs
Started by DistributedRuntime] - end - - style WMP fill:#ce422b,color:#fff - style WATCH fill:#ce422b,color:#fff - style PROM1 fill:#ce422b,color:#fff - style NATS fill:#27aae1,color:#fff - style SUB fill:#ce422b,color:#fff - style SS fill:#6c757d,color:#fff - end - - PY -->|"WorkerMetricsPublisher()"| WMPB - PY -->|"ForwardPassMetrics(worker_stats, kv_stats, spec_decode_stats)"| FPM - PY -->|"publish(metrics)"| WMPB - WMPB -->|"FFI: publish(Arc ForwardPassMetrics)"| WMP - WMP -->|"update_from_kvstats(kv_stats)"| PROM1 - WMP -->|"tx.send(metrics)"| WATCH - WATCH -->|"publish(KV_METRICS_SUBJECT, LoadEvent)"| NATS - NATS -->|"subscribe_with_type LoadEvent"| SUB - SS -->|"Worker: gather() from PROM1"| PROM1 -``` - -##### Method 2: Dynamic Registration - Component View - -```mermaid -graph TD - subgraph Python["Python Layer"] - PY[Python Application
main.py] - style PY fill:#3776ab,color:#fff - end - - subgraph PyO3["Python/Rust Interface - PyO3"] - PM[PrometheusMetricsUtils
endpoint.metrics
prometheus_metrics.rs] - MT[Metric Type Objects
IntGauge/Gauge/Counter/etc.
prometheus_metrics.rs] - style PM fill:#f4a261,color:#000 - style MT fill:#f4a261,color:#000 - end - - subgraph Rust["Rust Core"] - EP[Endpoint
component/endpoint.rs] - DRT[DistributedRuntime
distributed.rs] - PROM["Prometheus Registry
prometheus::IntGauge/Gauge/etc."] - SS[System Status Server
system_status_server.rs] - style EP fill:#ce422b,color:#fff - style DRT fill:#ce422b,color:#fff - style PROM fill:#ce422b,color:#fff - style SS fill:#6c757d,color:#fff - end - - PY -->|endpoint.metrics.create_intgauge| PM - PM -->|endpoint.metrics.create_intgauge| EP - EP -->|create & register| PROM - PM -->|wrap & return| MT - MT -->|return to Python| PY - PY -->|metric.set/get| MT - MT -->|direct FFI call| PROM - PY -.->|endpoint.metrics.register_callback| PM - PM -.->|drt.register_metrics_callback| DRT - SS ==>|execute_metrics_callbacks| DRT - DRT -.->|invoke Python callback| PY - SS -->|gather| PROM - - linkStyle 7 stroke:#ff6b6b,stroke-width:2px - linkStyle 8 stroke:#ff6b6b,stroke-width:2px - linkStyle 9 stroke:#ff6b6b,stroke-width:2px - linkStyle 10 stroke:#ff6b6b,stroke-width:2px -``` - -### Running the Examples - -The examples demonstrate Method 2 (Dynamo MetricsRegistry in Python) with two different update patterns. - -#### Prerequisites - -Update Python bindings if needed: -```bash -cd ~/dynamo/lib/bindings/python -maturin develop -``` - -#### Run Example A: Background Thread Updates +## Running Examples ```bash cd ~/dynamo/lib/bindings/python/examples/metrics -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py -``` -#### Run Example B: Callback-based Updates +# Background thread updates +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py -```bash -cd ~/dynamo/lib/bindings/python/examples/metrics +# Callback-based updates DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_callback.py -``` - -**Note:** The environment variables are required: -- `DYN_SYSTEM_ENABLED=true` - Enables the system status server -- `DYN_SYSTEM_PORT=8081` - Sets the port for the metrics endpoint - -#### Check the Metrics - -The metrics are served via the system status server at: -```bash +# Check Prometheus Exposition Format text metrics curl http://localhost:8081/metrics ``` - -Expected output includes: - -``` -# HELP request_total_slots Total request slots available -# TYPE request_total_slots gauge -request_total_slots{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556"} 1024 - -# HELP gpu_cache_usage_percent GPU cache usage percentage -# TYPE gpu_cache_usage_percent gauge -gpu_cache_usage_percent{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556"} 0.00 - -# HELP worker_active_requests Active requests per worker -# TYPE worker_active_requests gauge -worker_active_requests{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",worker_id="worker_1",model="llama-3"} 5 -worker_active_requests{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",worker_id="worker_2",model="llama-3"} 3 - -# HELP internal_update_count Number of times metrics callback was invoked -# TYPE internal_update_count counter -internal_update_count{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",type="internal"} 1 -``` - -Each time you query the `/metrics` endpoint, the `update_metrics()` callback is invoked, updating the metric values with fresh data. 
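The guide above lists counters and histograms among the available metric types, but its examples only exercise gauges. The following is a minimal sketch, not taken from the repository, that assumes counters and histograms follow the same `endpoint.metrics.create_*()` pattern as the gauge examples; the factory names `create_intcounter` and `create_histogram`, as well as `endpoint` and `run_inference`, are assumptions used purely for illustration.

```python
import time

# Assumed to follow the same pattern as create_intgauge/create_gauge above;
# check the dynamo.prometheus_metrics stubs for the exact factory names.
requests_total = endpoint.metrics.create_intcounter(
    "requests_processed_total", "Total requests processed by this worker"
)
latency_seconds = endpoint.metrics.create_histogram(
    "request_latency_seconds", "End-to-end request latency in seconds"
)

def handle_request(request):
    start = time.perf_counter()
    result = run_inference(request)  # hypothetical worker function
    requests_total.inc()             # counters only increase
    latency_seconds.observe(time.perf_counter() - start)
    return result
```

As with the gauge examples, the metrics are created and registered by the runtime when `create_*()` is called, so they should appear on the same `/metrics` endpoint, presumably with the same `dynamo_component_` prefix noted above.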
diff --git a/lib/bindings/python/rust/lib.rs b/lib/bindings/python/rust/lib.rs index fc905b45d1..fb9a31108e 100644 --- a/lib/bindings/python/rust/lib.rs +++ b/lib/bindings/python/rust/lib.rs @@ -125,12 +125,9 @@ fn create_request_context( #[pymodule] fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { // Initialize logging early unless OTEL export is enabled (which requires tokio runtime) - if std::env::var("OTEL_EXPORT_ENABLED") - .map(|v| v == "1") - .unwrap_or(false) - { + if rs::config::env_is_truthy("OTEL_EXPORT_ENABLED") { eprintln!( - "Warning: OTEL_EXPORT_ENABLED=1 detected. Logging initialization deferred until runtime is available. Early logs may be dropped." + "Warning: OTEL_EXPORT_ENABLED detected. Logging initialization deferred until runtime is available. Early logs may be dropped." ); } else { rs::logging::init(); @@ -449,10 +446,7 @@ impl DistributedRuntime { // Initialize logging in context where tokio runtime is available // otel exporter requires it - if std::env::var("OTEL_EXPORT_ENABLED") - .map(|v| v == "1") - .unwrap_or(false) - { + if rs::config::env_is_truthy("OTEL_EXPORT_ENABLED") { runtime.secondary().block_on(async { rs::logging::init(); }); diff --git a/lib/runtime/src/logging.rs b/lib/runtime/src/logging.rs index 4250c82f17..a46b043eb8 100644 --- a/lib/runtime/src/logging.rs +++ b/lib/runtime/src/logging.rs @@ -144,11 +144,9 @@ impl Default for LoggingConfig { } } -/// Check if OTLP trace exporting is enabled (set OTEL_EXPORT_ENABLED=1 to enable) +/// Check if OTLP trace exporting is enabled (set OTEL_EXPORT_ENABLED to a truthy value: 1, true, on, yes) fn otlp_exporter_enabled() -> bool { - std::env::var(OTEL_EXPORT_ENABLED_ENV) - .map(|v| v == "1") - .unwrap_or(false) + crate::config::env_is_truthy(OTEL_EXPORT_ENABLED_ENV) } /// Get the service name from environment or use default From 9cd7cf1eb581cf110b1b92d34cc412d72da3238f Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 02:07:40 +0000 Subject: [PATCH 04/17] Add new observability docs to toctree Signed-off-by: Keiven Chang --- docs/_sections/observability.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/_sections/observability.rst b/docs/_sections/observability.rst index f91973e7d1..0bc0c886d5 100644 --- a/docs/_sections/observability.rst +++ b/docs/_sections/observability.rst @@ -4,6 +4,10 @@ Observability .. 
toctree:: :hidden: + Overview <../observability/README> + Prometheus + Grafana Setup <../observability/prometheus-grafana> Metrics <../observability/metrics> - Logging <../observability/logging> - Health Checks <../observability/health-checks> \ No newline at end of file + Metrics Developer Guide <../observability/metrics-developer-guide> + Health Checks <../observability/health-checks> + Tracing <../observability/tracing> + Logging <../observability/logging> \ No newline at end of file From 8a80eee716f4d60c653f1b230cfe69114962ddee Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 02:16:24 +0000 Subject: [PATCH 05/17] Remove Optional Observability Stack section from README Signed-off-by: Keiven Chang --- README.md | 10 ---------- docs/_sections/observability.rst | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/README.md b/README.md index 5eb6ba557e..9b0cc366be 100644 --- a/README.md +++ b/README.md @@ -106,16 +106,6 @@ To quickly setup etcd & NATS, you can also run: docker compose -f deploy/docker-compose.yml up -d ``` -### Optional: Observability Stack - -For monitoring with metrics (Prometheus, Grafana) and distributed tracing (Tempo), deploy the observability stack: - -```bash -docker compose -f deploy/docker-observability.yml up -d -``` - -This provides GPU metrics (DCGM), NATS metrics, Prometheus, and Grafana dashboards. Access Grafana at `http://localhost:3000` (username: `dynamo`, password: `dynamo`). - ## 2. Select an engine We publish Python wheels specialized for each of our supported engines: vllm, sglang, and trtllm. The examples that follow use SGLang; continue reading for other engines. diff --git a/docs/_sections/observability.rst b/docs/_sections/observability.rst index 0bc0c886d5..c1b108c975 100644 --- a/docs/_sections/observability.rst +++ b/docs/_sections/observability.rst @@ -10,4 +10,4 @@ Observability Metrics Developer Guide <../observability/metrics-developer-guide> Health Checks <../observability/health-checks> Tracing <../observability/tracing> - Logging <../observability/logging> \ No newline at end of file + Logging <../observability/logging> From b6398ef9898f5fb99166aa9d9ac07e05146529b5 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 02:26:49 +0000 Subject: [PATCH 06/17] Fix documentation issues from observability reorganization - Fix docker-compose.yml comment to reference correct filename (docker-observability.yml not docker-compose-observability.yml) - Fix abbreviation formatting in kubernetes logging docs (e.g., etc.) Signed-off-by: Keiven Chang --- deploy/docker-compose.yml | 2 +- docs/kubernetes/observability/logging.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 75b2ff1537..31ded423ae 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # Bare minimum infrastructure services for Dynamo. 
-# For observability (metrics, tracing, dashboards), use docker-compose-observability.yml +# For observability (metrics, tracing, dashboards), use docker-observability.yml networks: server: diff --git a/docs/kubernetes/observability/logging.md b/docs/kubernetes/observability/logging.md index 68cba5604d..95c0785bef 100644 --- a/docs/kubernetes/observability/logging.md +++ b/docs/kubernetes/observability/logging.md @@ -141,4 +141,4 @@ kubectl port-forward svc/prometheus-grafana 3000:80 -n $MONITORING_NAMESPACE If everything is working, under Home > Dashboards > Dynamo Logs, you should see a dashboard that can be used to view the logs associated with our DynamoGraphDeployments -The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g frontend, worker, etc). +The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g., frontend, worker, etc.). From ef2e5297fb3ebce2fa7745f815a45bacd2555039 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Thu, 6 Nov 2025 18:31:56 -0800 Subject: [PATCH 07/17] Fix broken link anchor in Python metrics README Signed-off-by: Keiven Chang --- lib/bindings/python/examples/metrics/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bindings/python/examples/metrics/README.md b/lib/bindings/python/examples/metrics/README.md index 9893239ae0..f77d8ddff0 100644 --- a/lib/bindings/python/examples/metrics/README.md +++ b/lib/bindings/python/examples/metrics/README.md @@ -7,7 +7,7 @@ Example scripts demonstrating how to create and use Prometheus metrics in Python ## Documentation -See the **[Metrics Developer Guide - Python Section](../../../../docs/observability/metrics-developer-guide.md#python-metrics-api)** for complete documentation. +See the **[Metrics Developer Guide - Python Section](../../../../docs/observability/metrics-developer-guide.md#metrics-api-in-python)** for complete documentation. ## Running Examples From 4c1e06194abdf36f3911c13f03eacb50c072d3a9 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 03:14:56 +0000 Subject: [PATCH 08/17] Fix broken link in Python metrics examples README Correct relative path to metrics-developer-guide.md (needs 6 levels up, not 5) Signed-off-by: Keiven Chang --- lib/bindings/python/examples/metrics/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bindings/python/examples/metrics/README.md b/lib/bindings/python/examples/metrics/README.md index f77d8ddff0..5f83fbe31a 100644 --- a/lib/bindings/python/examples/metrics/README.md +++ b/lib/bindings/python/examples/metrics/README.md @@ -7,7 +7,7 @@ Example scripts demonstrating how to create and use Prometheus metrics in Python ## Documentation -See the **[Metrics Developer Guide - Python Section](../../../../docs/observability/metrics-developer-guide.md#metrics-api-in-python)** for complete documentation. +See the **[Metrics Developer Guide - Python Section](../../../../../docs/observability/metrics-developer-guide.md#metrics-api-in-python)** for complete documentation. 
## Running Examples From de7a07aa312d610e666858ee721f7b8cc1f133a5 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 17:47:29 +0000 Subject: [PATCH 09/17] Simplify Grafana dashboard names and improve logging docs - Remove redundant 'grafana-' prefix from dashboard filenames - Remove redundant '-dashboard' suffix from JSON files - Simplify dcgm-metrics.json copyright to 2-line SPDX format - Update all documentation references to new filenames - Add detailed DYN_LOG per-target syntax to observability docs - Clarify DYN_LOG_USE_LOCAL_TZ default timezone (UTC) - Add Dynamo logging variables table to K8s logging docs Signed-off-by: Keiven Chang --- ...board-providers.yml => dashboard-providers.yml} | 0 ...grafana-dcgm-metrics.json => dcgm-metrics.json} | 14 +------------- .../{grafana-dynamo-dashboard.json => dynamo.json} | 0 .../{grafana-kvbm-dashboard.json => kvbm.json} | 0 docs/kubernetes/observability/logging.md | 10 ++++++++++ docs/observability/logging.md | 4 ++-- docs/observability/prometheus-grafana.md | 8 ++++---- 7 files changed, 17 insertions(+), 19 deletions(-) rename deploy/observability/grafana_dashboards/{grafana-dashboard-providers.yml => dashboard-providers.yml} (100%) rename deploy/observability/grafana_dashboards/{grafana-dcgm-metrics.json => dcgm-metrics.json} (96%) rename deploy/observability/grafana_dashboards/{grafana-dynamo-dashboard.json => dynamo.json} (100%) rename deploy/observability/grafana_dashboards/{grafana-kvbm-dashboard.json => kvbm.json} (100%) diff --git a/deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml b/deploy/observability/grafana_dashboards/dashboard-providers.yml similarity index 100% rename from deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml rename to deploy/observability/grafana_dashboards/dashboard-providers.yml diff --git a/deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json b/deploy/observability/grafana_dashboards/dcgm-metrics.json similarity index 96% rename from deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json rename to deploy/observability/grafana_dashboards/dcgm-metrics.json index b662e497bc..e82c827e1b 100644 --- a/deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json +++ b/deploy/observability/grafana_dashboards/dcgm-metrics.json @@ -15,19 +15,7 @@ } ] }, - "copyright": [ - "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.", - "SPDX-License-Identifier: Apache-2.0", - "Licensed under the Apache License, Version 2.0 (the \"License\");", - "you may not use this file except in compliance with the License.", - "You may obtain a copy of the License at", - "http://www.apache.org/licenses/LICENSE-2.0", - "Unless required by applicable law or agreed to in writing, software", - "distributed under the License is distributed on an \"AS IS\" BASIS,", - "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.", - "See the License for the specific language governing permissions and", - "limitations under the License." - ], + "_copyright": "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: Apache-2.0", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, diff --git a/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json b/deploy/observability/grafana_dashboards/dynamo.json similarity index 100% rename from deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json rename to deploy/observability/grafana_dashboards/dynamo.json diff --git a/deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json b/deploy/observability/grafana_dashboards/kvbm.json similarity index 100% rename from deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json rename to deploy/observability/grafana_dashboards/kvbm.json diff --git a/docs/kubernetes/observability/logging.md b/docs/kubernetes/observability/logging.md index 95c0785bef..66b29dd28a 100644 --- a/docs/kubernetes/observability/logging.md +++ b/docs/kubernetes/observability/logging.md @@ -25,6 +25,8 @@ While this guide does not use Prometheus, it assumes Grafana is pre-installed wi ### 3. Environment Variables +#### Kubernetes Setup Variables + The following env variables are set: - `MONITORING_NAMESPACE`: The namespace where Loki is installed - `DYN_NAMESPACE`: The namespace where Dynamo Cloud Operator is installed @@ -34,6 +36,14 @@ export MONITORING_NAMESPACE=monitoring export DYN_NAMESPACE=dynamo-system ``` +#### Dynamo Logging Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for Loki) | `true` | +| `DYN_LOG` | Log levels per target `,=,=` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps | `true` | + ## Installation Steps ### 1. Install Loki diff --git a/docs/observability/logging.md b/docs/observability/logging.md index 8b811d0649..36c771cbf2 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -29,8 +29,8 @@ distributed tracing. 
| Variable | Description | Default | Example | |----------|-------------|---------|---------| | `DYN_LOGGING_JSONL` | Enable JSONL logging format | `false` | `true` | -| `DYN_LOG` | Log level: `info` or `debug` | `info` | `debug` | -| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps | `false` | `true` | +| `DYN_LOG` | Log levels per target `,=,=` | `info` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps (default is UTC) | `false` | `true` | | `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration | none | `/path/to/config.toml` | | `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces | `dynamo` | `dynamo-frontend` | | `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index 8789f888e4..2a5fdd9092 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -112,10 +112,10 @@ The following configuration files are located in the `deploy/observability/` dir - [docker-observability.yml](../../deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters - [prometheus.yml](../../deploy/observability/prometheus.yml): Contains Prometheus scraping configuration - [grafana-datasources.yml](../../deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration -- [grafana_dashboards/grafana-dashboard-providers.yml](../../deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration -- [grafana_dashboards/grafana-dynamo-dashboard.json](../../deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics -- [grafana_dashboards/grafana-dcgm-metrics.json](../../deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics -- [grafana_dashboards/grafana-kvbm-dashboard.json](../../deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics +- [grafana_dashboards/dashboard-providers.yml](../../deploy/observability/grafana_dashboards/dashboard-providers.yml): Contains Grafana dashboard provider configuration +- [grafana_dashboards/dynamo.json](../../deploy/observability/grafana_dashboards/dynamo.json): A general Dynamo Dashboard for both SW and HW metrics +- [grafana_dashboards/dcgm-metrics.json](../../deploy/observability/grafana_dashboards/dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics +- [grafana_dashboards/kvbm.json](../../deploy/observability/grafana_dashboards/kvbm.json): Contains Grafana dashboard configuration for KVBM metrics ### Configuration From 67541d06b14b96927ac6367b60755cc0bd1a007c Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 20:13:59 +0000 Subject: [PATCH 10/17] Reorganize and standardize observability documentation - Centralize docker-compose instructions in README.md - Add topology diagram and configuration files to README.md - Standardize all docs to use 'Getting Started Quickly' heading - Remove duplicate docker-compose commands from individual guides - All guides now reference README.md for observability stack setup - Update DYN_LOG documentation with per-target syntax examples - Clarify DYN_LOG_USE_LOCAL_TZ default timezone (UTC) - Add Dynamo logging variables to 
K8s logging docs - Remove lib/bindings/python/examples/metrics/README.md - Improve consistency across all observability documentation Signed-off-by: Keiven Chang --- docs/observability/README.md | 68 ++++++++++++++- docs/observability/health-checks.md | 2 +- docs/observability/logging.md | 18 ++-- docs/observability/metrics.md | 11 ++- docs/observability/prometheus-grafana.md | 82 +++---------------- docs/observability/tracing.md | 27 +----- .../python/examples/metrics/README.md | 25 ------ 7 files changed, 103 insertions(+), 130 deletions(-) delete mode 100644 lib/bindings/python/examples/metrics/README.md diff --git a/docs/observability/README.md b/docs/observability/README.md index 12c71c335e..9ecc4d63e4 100644 --- a/docs/observability/README.md +++ b/docs/observability/README.md @@ -5,9 +5,32 @@ SPDX-License-Identifier: Apache-2.0 # Dynamo Observability -## Quick Start +## Getting Started Quickly -For a quick start guide to get Prometheus and Grafana running with Dynamo on a single machine, see [Prometheus + Grafana Setup](prometheus-grafana.md). +This is an example to get started quickly on a single machine. + +### Prerequisites + +Install these on your machine: + +- [Docker](https://docs.docker.com/get-docker/) +- [Docker Compose](https://docs.docker.com/compose/install/) + +### Starting the Observability Stack + +Dynamo provides a Docker Compose-based observability stack that includes Prometheus, Grafana, Tempo, and various exporters for metrics, tracing, and visualization. + +From the Dynamo root directory: + +```bash +# Start infrastructure (NATS, etcd) +docker compose -f deploy/docker-compose.yml up -d + +# Start observability stack (Prometheus, Grafana, Tempo, DCGM GPU exporter, NATS exporter) +docker compose -f deploy/docker-observability.yml up -d +``` + +For detailed setup instructions and configuration, see [Prometheus + Grafana Setup](prometheus-grafana.md). ## Observability Documentations @@ -30,3 +53,44 @@ For a quick start guide to get Prometheus and Grafana running with Dynamo on a s For Kubernetes-specific setup and configuration, see [docs/kubernetes/observability/](../kubernetes/observability/). +--- + +## Topology + +This provides: +- **Prometheus** on `http://localhost:9090` - metrics collection and querying +- **Grafana** on `http://localhost:3000` - visualization dashboards (username: `dynamo`, password: `dynamo`) +- **Tempo** on `http://localhost:3200` - distributed tracing backend +- **DCGM Exporter** on `http://localhost:9401/metrics` - GPU metrics +- **NATS Exporter** on `http://localhost:7777/metrics` - NATS messaging metrics + +### Service Relationship Diagram +```mermaid +graph TD + BROWSER[Browser] -->|:3000| GRAFANA[Grafana :3000] + subgraph DockerComposeNetwork [Network inside Docker Compose] + NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] + PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] + PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] + PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP + PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] + PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] + DYNAMOFE --> DYNAMOBACKEND + GRAFANA -->|:9090/query API| PROMETHEUS + end +``` + +The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. 
Such a configuration is typical in distributed systems like SLURM. + +### Configuration Files + +The following configuration files are located in the `deploy/observability/` directory: +- [docker-compose.yml](../../deploy/docker-compose.yml): Defines NATS and etcd services +- [docker-observability.yml](../../deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters +- [prometheus.yml](../../deploy/observability/prometheus.yml): Contains Prometheus scraping configuration +- [grafana-datasources.yml](../../deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration +- [grafana_dashboards/dashboard-providers.yml](../../deploy/observability/grafana_dashboards/dashboard-providers.yml): Contains Grafana dashboard provider configuration +- [grafana_dashboards/dynamo.json](../../deploy/observability/grafana_dashboards/dynamo.json): A general Dynamo Dashboard for both SW and HW metrics +- [grafana_dashboards/dcgm-metrics.json](../../deploy/observability/grafana_dashboards/dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics +- [grafana_dashboards/kvbm.json](../../deploy/observability/grafana_dashboards/kvbm.json): Contains Grafana dashboard configuration for KVBM metrics + diff --git a/docs/observability/health-checks.md b/docs/observability/health-checks.md index 2213b2bc10..7e8e103003 100644 --- a/docs/observability/health-checks.md +++ b/docs/observability/health-checks.md @@ -22,7 +22,7 @@ orchestration frameworks such as Kubernetes. | `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` | | `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` | -## Getting Started (Single GPU) +## Getting Started Quickly Enable health checks and query endpoints: diff --git a/docs/observability/logging.md b/docs/observability/logging.md index 36c771cbf2..6c0057c991 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -36,7 +36,13 @@ distributed tracing. | `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | | `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` | -## Getting Started +## Getting Started Quickly + +### Start Observability Stack + +For collecting and visualizing logs with Grafana Loki (Kubernetes), or viewing trace context in logs alongside Grafana Tempo, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. + +### Enable Structured Logging Enable structured JSONL logging: @@ -99,15 +105,15 @@ Resulting Log format: {"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"} ``` -## Logging of OpenTelemetry Tracing +## Logging of Trace and Span IDs -When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend. +When `DYN_LOGGING_JSONL` is enabled, all logs include OpenTelemetry compatible `trace_id` and `span_id` fields, and spans are automatically created for requests. 
This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. -**Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md) since OpenTelemetry has aspects of both logging and tracing. For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). +**Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md). For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). ### Configuration for Logging -To see OpenTelemetry trace information in logs: +To see trace information in logs: ```bash export DYN_LOGGING_JSONL=true @@ -141,7 +147,7 @@ Check the logs (stderr) for JSONL output containing `trace_id`, `span_id`, and ` ## Trace and Span Information in Logs -This section shows how OpenTelemetry trace and span information appears in JSONL logs. These logs can be used to understand request flows even without a trace visualization backend. +This section shows how trace and span information appears in JSONL logs. These logs can be used to understand request flows even without a trace visualization backend. ### Example Disaggregated Trace in Grafana diff --git a/docs/observability/metrics.md b/docs/observability/metrics.md index e947285545..bc5d8e20b8 100644 --- a/docs/observability/metrics.md +++ b/docs/observability/metrics.md @@ -20,9 +20,16 @@ Dynamo provides built-in metrics capabilities through the Dynamo metrics API, wh | `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | | `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | -## Getting Started (Single GPU) +## Getting Started Quickly -**Note:** This requires NATS and etcd running. For a complete setup with Prometheus and Grafana visualization, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). +This is a single machine example. + +### Start Observability Stack + +For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. + + +### Launch Dynamo Components Launch a frontend and vLLM backend to test metrics: diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index 2a5fdd9092..6684bc2a59 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -19,26 +19,13 @@ This guide shows how to set up Prometheus and Grafana for visualizing Dynamo met | `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | | `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | -## Getting Started (Single GPU) +## Getting Started Quickly -### Prerequisites - -Install these on your machine: - -- [Docker](https://docs.docker.com/get-docker/) -- [Docker Compose](https://docs.docker.com/compose/install/) +This is a single machine example. ### Start the Observability Stack -From the Dynamo root directory: - -```bash -# Start infrastructure (NATS, etcd) -docker compose -f deploy/docker-compose.yml up -d - -# Then start observability stack (Prometheus, Grafana, Tempo, DCGM GPU exporter, NATS exporter) -docker compose -f deploy/docker-observability.yml up -d -``` +Start the observability stack (Prometheus, Grafana, Tempo, exporters). 
See [Observability Getting Started](README.md#getting-started-quickly) for instructions and prerequisites. ### Start Dynamo Components @@ -85,53 +72,17 @@ Other interfaces: --- -## Topology - -Default Service Relationship Diagram: -```mermaid -graph TD - BROWSER[Browser] -->|:3000| GRAFANA[Grafana :3000] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` - -The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. +## Configuration -### Required Files - -The following configuration files are located in the `deploy/observability/` directory: -- [docker-compose.yml](../../deploy/docker-compose.yml): Defines NATS and etcd services -- [docker-observability.yml](../../deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters -- [prometheus.yml](../../deploy/observability/prometheus.yml): Contains Prometheus scraping configuration -- [grafana-datasources.yml](../../deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration -- [grafana_dashboards/dashboard-providers.yml](../../deploy/observability/grafana_dashboards/dashboard-providers.yml): Contains Grafana dashboard provider configuration -- [grafana_dashboards/dynamo.json](../../deploy/observability/grafana_dashboards/dynamo.json): A general Dynamo Dashboard for both SW and HW metrics -- [grafana_dashboards/dcgm-metrics.json](../../deploy/observability/grafana_dashboards/dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics -- [grafana_dashboards/kvbm.json](../../deploy/observability/grafana_dashboards/kvbm.json): Contains Grafana dashboard configuration for KVBM metrics - -### Configuration - -#### Prometheus +### Prometheus The Prometheus configuration is specified in [prometheus.yml](../../deploy/observability/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint. Please be aware that you might need to modify the target settings to align with your specific host configuration and network environment. -After making changes to prometheus.yml, restart the Prometheus service: +After making changes to prometheus.yml, restart the Prometheus service. See [Observability Getting Started](README.md#getting-started-quickly) for Docker Compose commands. -```bash -docker compose -f deploy/docker-observability.yml restart prometheus -``` - -#### Grafana +### Grafana Grafana is pre-configured with: - Prometheus datasource @@ -139,27 +90,18 @@ Grafana is pre-configured with: ### Troubleshooting -1. Verify services are running: - ```bash - docker compose -f deploy/docker-observability.yml ps - ``` +1. Verify services are running using `docker compose ps` -2. 
Check logs: - ```bash - docker compose -f deploy/docker-observability.yml logs prometheus - docker compose -f deploy/docker-observability.yml logs grafana - ``` +2. Check logs using `docker compose logs` 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. -4. If you encounter issues with stale data or configuration, stop services and wipe volumes: - ```bash - docker compose -f deploy/docker-observability.yml down -v - docker compose -f deploy/docker-observability.yml up -d - ``` +4. If you encounter issues with stale data or configuration, stop services and wipe volumes using `docker compose down -v` then restart. **Note:** The `-v` flag removes named volumes (grafana-data, tempo-data), which will reset dashboards and stored metrics. +For specific Docker Compose commands, see [Observability Getting Started](README.md#getting-started-quickly). + ## Developer Guide For detailed information on creating custom metrics in Dynamo components, see: diff --git a/docs/observability/tracing.md b/docs/observability/tracing.md index feae4d3f29..a87c2a46c5 100644 --- a/docs/observability/tracing.md +++ b/docs/observability/tracing.md @@ -24,27 +24,11 @@ This guide covers single GPU demo setup using Docker Compose. For Kubernetes dep | `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` | | `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend` | -## Getting Started (Single GPU) +## Getting Started Quickly ### 1. Start Observability Stack -```bash -cd deploy -docker compose -f docker-observability.yml up -d -``` - -This will start: -- **Tempo** on `http://localhost:3200` (HTTP API) and `localhost:4317` (OTLP gRPC) -- **Prometheus** on `http://localhost:9090` -- **Grafana** on `http://localhost:3000` (username: `dynamo`, password: `dynamo`) -- **DCGM Exporter** on `http://localhost:9401/metrics` (GPU metrics) -- **NATS Exporter** on `http://localhost:7777/metrics` - -Verify services are running: - -```bash -docker compose -f docker-observability.yml ps -``` +Start the observability stack (Prometheus, Grafana, Tempo, exporters). See [Observability Getting Started](README.md#getting-started-quickly) for instructions. ### 2. Set Environment Variables @@ -155,12 +139,7 @@ Below is an example of what a trace looks like in Grafana Tempo: ### 6. Stop Services -When done, stop the observability stack: - -```bash -cd deploy -docker compose -f docker-observability.yml down -``` +When done, stop the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for Docker Compose commands. --- diff --git a/lib/bindings/python/examples/metrics/README.md b/lib/bindings/python/examples/metrics/README.md deleted file mode 100644 index 5f83fbe31a..0000000000 --- a/lib/bindings/python/examples/metrics/README.md +++ /dev/null @@ -1,25 +0,0 @@ - - - -# Python Metrics Examples - -Example scripts demonstrating how to create and use Prometheus metrics in Python using the Dynamo metrics API. - -## Documentation - -See the **[Metrics Developer Guide - Python Section](../../../../../docs/observability/metrics-developer-guide.md#metrics-api-in-python)** for complete documentation. 
- -## Running Examples - -```bash -cd ~/dynamo/lib/bindings/python/examples/metrics - -# Background thread updates -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py - -# Callback-based updates -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_callback.py - -# Check Prometheus Exposition Format text metrics -curl http://localhost:8081/metrics -``` From 6cac372d56fe56934c52db535103dc731a4de20f Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 7 Nov 2025 23:33:36 +0000 Subject: [PATCH 11/17] logging to not mention OTEL Signed-off-by: Keiven Chang --- docs/observability/logging.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/observability/logging.md b/docs/observability/logging.md index 6c0057c991..1044d061c6 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -32,7 +32,7 @@ distributed tracing. | `DYN_LOG` | Log levels per target `,=,=` | `info` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | | `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps (default is UTC) | `false` | `true` | | `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration | none | `/path/to/config.toml` | -| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces | `dynamo` | `dynamo-frontend` | +| `OTEL_SERVICE_NAME` | Service name for trace and span information | `dynamo` | `dynamo-frontend` | | `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | | `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` | @@ -107,7 +107,7 @@ Resulting Log format: ## Logging of Trace and Span IDs -When `DYN_LOGGING_JSONL` is enabled, all logs include OpenTelemetry compatible `trace_id` and `span_id` fields, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. +When `DYN_LOGGING_JSONL` is enabled, all logs include `trace_id` and `span_id` fields that are compatible with OpenTelemetry, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. **Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md). For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). @@ -220,7 +220,7 @@ When viewing the corresponding trace in Grafana, you should be able to see somet The following shows the JSONL logs from the frontend service for the same request. Note the `trace_id` field (`b672ccf48683b392891c5cb4163d4b51`) that correlates all logs for this request, and the `span_id` field that identifies individual operations: ``` -{"time":"2025-10-31T20:52:07.707164Z","level":"INFO","file":"/opt/dynamo/lib/runtime/src/logging.rs","line":806,"target":"dynamo_runtime::logging","message":"OpenTelemetry OTLP export enabled","endpoint":"http://tempo.tm.svc.cluster.local:4317","service":"frontend"} +{"time":"2025-10-31T20:52:07.707164Z","level":"INFO","file":"/opt/dynamo/lib/runtime/src/logging.rs","line":806,"target":"dynamo_runtime::logging","message":"OTLP export enabled","endpoint":"http://tempo.tm.svc.cluster.local:4317","service":"frontend"} ... 
{"time":"2025-10-31T20:52:10.707164Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} {"time":"2025-10-31T20:52:10.745264Z","level":"DEBUG","file":"/opt/dynamo/lib/llm/src/kv_router/prefill_router.rs","line":232,"target":"dynamo_llm::kv_router::prefill_router","message":"Prefill succeeded, using disaggregated params for decode","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} From 8eabeddb9c4a68759c5ceb3dbb136e40ff45e93c Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Sat, 8 Nov 2025 02:17:04 +0000 Subject: [PATCH 12/17] Fix race condition in test_http_service Add wait_for_service_ready() call to ensure the HTTP service is fully started and listening before sending requests. Without this, the test could fail with 404 errors if requests arrive before the server is ready. This race condition has existed since the test was written, but became more visible after recent changes (e.g. KeyValueStoreManager refactor in Oct 2025). The wait_for_service_ready() helper was added in July 2025 for HTTP disconnect tests but the original test_http_service was never updated to use it. This follows the pattern used by other tests in the same file. Signed-off-by: Keiven Chang --- lib/llm/tests/http-service.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/llm/tests/http-service.rs b/lib/llm/tests/http-service.rs index 04c8589a56..71ed9abc5e 100644 --- a/lib/llm/tests/http-service.rs +++ b/lib/llm/tests/http-service.rs @@ -277,6 +277,9 @@ async fn test_http_service() { let cancel_token = token.clone(); let task = tokio::spawn(async move { service.run(token.clone()).await }); + // Wait for the service to be ready before proceeding + wait_for_service_ready(port).await; + let registry = Registry::new(); // TODO: Shouldn't this test know the card before it registers a model? From ac5a72c051763ba5eb74385acf1b7f8f117881b4 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Sat, 8 Nov 2025 04:01:23 +0000 Subject: [PATCH 13/17] Clarify logging terminology and fix copyright headers - Update logging.md to clarify that trace/span information uses OpenTelemetry format/libraries but doesn't require an OpenTelemetry backend (Tempo/Jaeger) - Standardize copyright headers to 2-line SPDX format across observability docs - Remove full Apache license text from logging.md - Add missing copyright header to prometheus-grafana.md Signed-off-by: Keiven Chang --- docs/observability/logging.md | 16 +++------------- docs/observability/prometheus-grafana.md | 5 +++++ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/observability/logging.md b/docs/observability/logging.md index 330f3bdfdd..4364bbe7e2 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -1,18 +1,6 @@ # Dynamo Logging @@ -107,7 +95,9 @@ Resulting Log format: ## Logging of Trace and Span IDs -When `DYN_LOGGING_JSONL` is enabled, all logs include `trace_id` and `span_id` fields that are compatible with OpenTelemetry, and spans are automatically created for requests. 
This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. +When `DYN_LOGGING_JSONL` is enabled, all logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. + +The trace and span information uses the OpenTelemetry format and libraries, which means the IDs are compatible with OpenTelemetry-based tracing backends like Tempo or Jaeger if you later choose to enable trace export. **Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md). For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index 6684bc2a59..28d717643f 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -1,3 +1,8 @@ + + # Metrics Visualization with Prometheus and Grafana ## Overview From 24a15fdd0b9e10d7313f9466e6df489ad6eeca44 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Mon, 10 Nov 2025 18:31:19 +0000 Subject: [PATCH 14/17] add back README.md references, and removed unused podmonitor files Signed-off-by: Keiven Chang --- deploy/observability/README.md | 3 + .../grafana_dashboards/README.md | 11 + .../grafana_dashboards/temp-loki.json | 214 ++++++++++++++++++ .../k8s/frontend-podmonitor.yaml | 25 -- .../observability/k8s/planner-podmonitor.yaml | 20 -- .../observability/k8s/worker-podmonitor.yaml | 20 -- 6 files changed, 228 insertions(+), 65 deletions(-) create mode 100644 deploy/observability/README.md create mode 100644 deploy/observability/grafana_dashboards/README.md create mode 100644 deploy/observability/grafana_dashboards/temp-loki.json delete mode 100644 deploy/observability/k8s/frontend-podmonitor.yaml delete mode 100644 deploy/observability/k8s/planner-podmonitor.yaml delete mode 100644 deploy/observability/k8s/worker-podmonitor.yaml diff --git a/deploy/observability/README.md b/deploy/observability/README.md new file mode 100644 index 0000000000..0fb3e7723c --- /dev/null +++ b/deploy/observability/README.md @@ -0,0 +1,3 @@ +# Dynamo Observability + +For detailed documentation on Observability (Prometheus metrics, tracing, and logging), please refer to [docs/observability/](../../docs/observability/). diff --git a/deploy/observability/grafana_dashboards/README.md b/deploy/observability/grafana_dashboards/README.md new file mode 100644 index 0000000000..7eaeb16808 --- /dev/null +++ b/deploy/observability/grafana_dashboards/README.md @@ -0,0 +1,11 @@ +# Example Grafana Dashboards + +This directory contains example Grafana dashboards for Dynamo observability. These are starter files that you can use as references for building your own custom dashboards. + +- `dynamo.json` - General Dynamo dashboard showing software and hardware metrics +- `dcgm-metrics.json` - GPU metrics dashboard using DCGM exporter data +- `kvbm.json` - KV Block Manager metrics dashboard +- `temp-loki.json` - Logging dashboard for Loki integration +- `dashboard-providers.yml` - Configuration file for dashboard provisioning + +For setup instructions and usage, see [Observability Documentation](../../../docs/observability/). 
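The logging changes above stress that JSONL logs carry `trace_id` and `span_id` fields which can be used to correlate log lines with requests even without a tracing backend. As a rough illustration of that workflow (this helper is not part of Dynamo; it only assumes the JSONL field names shown in the sample log lines: `time`, `message`, `trace_id`, `span_id`), a few lines of standard-library Python can group a log stream by trace:

```python
#!/usr/bin/env python3
"""Illustrative helper (not part of Dynamo): group JSONL log lines by trace_id."""
import json
import sys
from collections import defaultdict

def group_by_trace(lines):
    traces = defaultdict(list)
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip non-JSONL output such as plain-text startup messages
        traces[record.get("trace_id", "<no-trace>")].append(record)
    return traces

if __name__ == "__main__":
    for trace_id, records in group_by_trace(sys.stdin).items():
        print(f"trace_id={trace_id} ({len(records)} log lines)")
        for r in sorted(records, key=lambda r: r.get("time", "")):
            print(f"  {r.get('time', '?')} [{r.get('span_id', '-')}] {r.get('message', '')}")
```

Piping a pod's JSONL output or a saved log file into this script prints every message recorded for each request in time order, which is the correlation the docs describe.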
diff --git a/deploy/observability/grafana_dashboards/temp-loki.json b/deploy/observability/grafana_dashboards/temp-loki.json new file mode 100644 index 0000000000..04f27cd250 --- /dev/null +++ b/deploy/observability/grafana_dashboards/temp-loki.json @@ -0,0 +1,214 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 21, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "direction": "backward", + "editorMode": "builder", + "expr": "{namespace=~\"$namespace\", nvidia_com_dynamo_graph_deployment_name=~\"$dynamographdeployment\", nvidia_com_dynamo_component_type=~\"$component\"} |= \"$search\" |= \"$trace_id\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "DynamoGraph Logs", + "type": "logs" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": ["dynamograph", "logs"], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "Loki", + "value": "Loki" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "loki", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "definition": "label_values(namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 1, + "regex": ".+", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "definition": "label_values(nvidia_com_dynamo_graph_deployment_name)", + "hide": 0, + "includeAll": true, + "label": "DynamoGraph Deployment", + "multi": true, + "name": "dynamographdeployment", + "options": [], + "query": "label_values(nvidia_com_dynamo_graph_deployment_name)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "loki", + "uid": "$datasource" + }, + "definition": "label_values(nvidia_com_dynamo_component_type)", + "hide": 0, + "includeAll": true, + "label": "Component", + "multi": true, + "name": "component", + "options": [], + "query": "label_values(nvidia_com_dynamo_component_type)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "", + 
"value": "" + }, + "label": "Trace ID", + "name": "trace_id", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "", + "value": "" + }, + "label": "Search", + "name": "search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "DynamoGraph Logs", + "description": "Dashboard for viewing DynamoGraph deployment logs across components and namespaces", + "version": 1 + } diff --git a/deploy/observability/k8s/frontend-podmonitor.yaml b/deploy/observability/k8s/frontend-podmonitor.yaml deleted file mode 100644 index c7560797dc..0000000000 --- a/deploy/observability/k8s/frontend-podmonitor.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-frontend-metrics - namespace: ${NAMESPACE} -spec: - selector: - matchLabels: - nvidia.com/metrics-enabled: "true" - nvidia.com/dynamo-component-type: "frontend" - podMetricsEndpoints: - - port: http - path: /metrics - interval: 5s - relabelings: - - action: replace - sourceLabels: - - __meta_kubernetes_pod_label_nvidia_com_dynamo_namespace - targetLabel: dynamo_namespace - namespaceSelector: - matchNames: - - ${NAMESPACE} diff --git a/deploy/observability/k8s/planner-podmonitor.yaml b/deploy/observability/k8s/planner-podmonitor.yaml deleted file mode 100644 index 15f2e90a96..0000000000 --- a/deploy/observability/k8s/planner-podmonitor.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-planner-metrics - namespace: $NAMESPACE -spec: - selector: - matchLabels: - nvidia.com/metrics-enabled: "true" - nvidia.com/dynamo-component-type: "planner" - podMetricsEndpoints: - - port: metrics - path: /metrics - interval: 5s - namespaceSelector: - matchNames: - - $NAMESPACE \ No newline at end of file diff --git a/deploy/observability/k8s/worker-podmonitor.yaml b/deploy/observability/k8s/worker-podmonitor.yaml deleted file mode 100644 index 1fb44cbbc5..0000000000 --- a/deploy/observability/k8s/worker-podmonitor.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-worker-metrics - namespace: ${NAMESPACE} -spec: - selector: - matchLabels: - nvidia.com/metrics-enabled: "true" - nvidia.com/dynamo-component-type: "worker" - podMetricsEndpoints: - - port: system - path: /metrics - interval: 5s - namespaceSelector: - matchNames: - - ${NAMESPACE} From d53444c144e3e4bf6bf2cfe387e92ed80f05bc49 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Mon, 10 Nov 2025 18:38:36 +0000 Subject: [PATCH 15/17] Remove remaining DYN_SYSTEM_ENABLED variables Signed-off-by: Keiven Chang --- components/src/dynamo/sglang/publisher.py | 2 +- docs/observability/README.md | 6 +++--- docs/observability/health-checks.md | 9 +++------ docs/observability/metrics-developer-guide.md | 11 +++++------ docs/observability/metrics.md | 10 +++------- docs/observability/prometheus-grafana.md | 8 +++----- examples/backends/sglang/launch/agg.sh | 2 +- examples/backends/sglang/launch/disagg_same_gpu.sh | 4 ++-- examples/backends/trtllm/launch/disagg_same_gpu.sh | 4 ++-- examples/backends/vllm/launch/agg_multimodal.sh | 2 +- lib/runtime/examples/system_metrics/README.md | 4 ++-- .../examples/system_metrics/tests/integration_test.rs | 5 ++--- tests/fault_tolerance/etcd_ha/test_sglang.py | 1 - tests/fault_tolerance/etcd_ha/test_trtllm.py | 1 - tests/fault_tolerance/etcd_ha/test_vllm.py | 1 - 15 files changed, 28 insertions(+), 42 deletions(-) diff --git a/components/src/dynamo/sglang/publisher.py b/components/src/dynamo/sglang/publisher.py index 5f24ba26a9..986ffc10e0 100644 --- a/components/src/dynamo/sglang/publisher.py +++ b/components/src/dynamo/sglang/publisher.py @@ -204,7 +204,7 @@ def setup_prometheus_registry( SGLang uses multiprocess architecture where metrics are stored in shared memory. MultiProcessCollector aggregates metrics from all worker processes. The Prometheus registry collects sglang:* metrics which are exposed via the metrics server endpoint - (typically port 8081) when DYN_SYSTEM_ENABLED=true. + (set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081). Args: engine: The SGLang engine instance. 
diff --git a/docs/observability/README.md b/docs/observability/README.md index 9ecc4d63e4..e802f9b10c 100644 --- a/docs/observability/README.md +++ b/docs/observability/README.md @@ -36,8 +36,8 @@ For detailed setup instructions and configuration, see [Prometheus + Grafana Set | Guide | Description | Environment Variables to Control | |-------|-------------|----------------------------------| -| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`† | -| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | +| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_PORT`† | +| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | | [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† | | [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† | @@ -47,7 +47,7 @@ For detailed setup instructions and configuration, see [Prometheus + Grafana Set | Guide | Description | Environment Variables to Control | |-------|-------------|----------------------------------| -| [Metrics Developer Guide](metrics-developer-guide.md) | Creating custom metrics in Rust and Python | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`† | +| [Metrics Developer Guide](metrics-developer-guide.md) | Creating custom metrics in Rust and Python | `DYN_SYSTEM_PORT`† | ## Kubernetes diff --git a/docs/observability/health-checks.md b/docs/observability/health-checks.md index 61b9cafa84..895b67a474 100644 --- a/docs/observability/health-checks.md +++ b/docs/observability/health-checks.md @@ -15,7 +15,6 @@ orchestration frameworks such as Kubernetes. | Variable | Description | Default | Example | |----------|-------------|---------|---------| -| `DYN_SYSTEM_ENABLED` | Enable system status server | `false` | `true` | | `DYN_SYSTEM_PORT` | System status server port | `8081` | `9090` | | `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Initial health status | `notready` | `ready`, `notready` | | `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` | @@ -27,13 +26,11 @@ orchestration frameworks such as Kubernetes. 
Enable health checks and query endpoints: ```bash -# Enable system status server -export DYN_SYSTEM_ENABLED=true -export DYN_SYSTEM_PORT=8081 - # Start your Dynamo components python -m dynamo.frontend --http-port 8000 & -python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & + +# Enable system status server on port 8081 +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & ``` Check health status: diff --git a/docs/observability/metrics-developer-guide.md b/docs/observability/metrics-developer-guide.md index c07d235751..036f4bd401 100644 --- a/docs/observability/metrics-developer-guide.md +++ b/docs/observability/metrics-developer-guide.md @@ -9,14 +9,13 @@ This guide explains how to create and use custom metrics in Dynamo components us ## Metrics Exposure -All metrics created via the Dynamo metrics API are automatically exposed on the `/metrics` HTTP endpoint in Prometheus Exposition Format text when the following environment variables are set: +All metrics created via the Dynamo metrics API are automatically exposed on the `/metrics` HTTP endpoint in Prometheus Exposition Format text when the following environment variable is set: -- `DYN_SYSTEM_ENABLED=true` - Enable the system metrics server -- `DYN_SYSTEM_PORT=` - Port for the metrics endpoint (default: `8081`) +- `DYN_SYSTEM_PORT=` - Port for the metrics endpoint (set to positive value to enable, default: `-1` disabled) Example: ```bash -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model ``` Prometheus Exposition Format text metrics will be available at: `http://localhost:8081/metrics` @@ -255,8 +254,8 @@ Example scripts: [lib/bindings/python/examples/metrics/](../../lib/bindings/pyth ```bash cd ~/dynamo/lib/bindings/python/examples/metrics -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_callback.py +DYN_SYSTEM_PORT=8081 ./server_with_loop.py +DYN_SYSTEM_PORT=8081 ./server_with_callback.py ``` --- diff --git a/docs/observability/metrics.md b/docs/observability/metrics.md index bc5d8e20b8..325457fbc6 100644 --- a/docs/observability/metrics.md +++ b/docs/observability/metrics.md @@ -17,8 +17,7 @@ Dynamo provides built-in metrics capabilities through the Dynamo metrics API, wh | Variable | Description | Default | Example | |----------|-------------|---------|---------| -| `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | -| `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | ## Getting Started Quickly @@ -36,11 +35,8 @@ Launch a frontend and vLLM backend to test metrics: ```bash $ python -m dynamo.frontend --http-port 8000 -# Enable system metrics server -export DYN_SYSTEM_ENABLED=true -export DYN_SYSTEM_PORT=8081 - -$ python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ +# Enable system metrics server on port 8081 +$ DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 ``` diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index 28d717643f..949273e537 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -21,8 +21,7 @@ This guide shows how to set up Prometheus and Grafana for visualizing Dynamo met | Variable | Description | Default | Example | 
|----------|-------------|---------|---------| -| `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | -| `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | ## Getting Started Quickly @@ -40,9 +39,8 @@ Start frontend and worker (a simple single GPU example): # Start frontend in one process python -m dynamo.frontend --http-port 8000 & -# Start vLLM worker with metrics enabled in another process -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ - python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager +# Start vLLM worker with metrics enabled on port 8081 +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager ``` After the workers are running, send a few test requests to populate metrics in the system: diff --git a/examples/backends/sglang/launch/agg.sh b/examples/backends/sglang/launch/agg.sh index ca51036081..feb63d0362 100755 --- a/examples/backends/sglang/launch/agg.sh +++ b/examples/backends/sglang/launch/agg.sh @@ -17,7 +17,7 @@ python3 -m dynamo.frontend --http-port=8000 & DYNAMO_PID=$! # run worker with metrics enabled -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/sglang/launch/disagg_same_gpu.sh b/examples/backends/sglang/launch/disagg_same_gpu.sh index 555970bd9a..ba309e56a7 100755 --- a/examples/backends/sglang/launch/disagg_same_gpu.sh +++ b/examples/backends/sglang/launch/disagg_same_gpu.sh @@ -41,7 +41,7 @@ python3 -m dynamo.frontend --router-mode kv --http-port=8000 & DYNAMO_PID=$! # run prefill worker with metrics on port 8081 -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ @@ -71,7 +71,7 @@ echo "Waiting for prefill worker to initialize..." sleep 5 # run decode worker with metrics on port 8082 (foreground) -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=8082 \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/trtllm/launch/disagg_same_gpu.sh b/examples/backends/trtllm/launch/disagg_same_gpu.sh index 695b32b637..1036329e8d 100755 --- a/examples/backends/trtllm/launch/disagg_same_gpu.sh +++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh @@ -53,7 +53,7 @@ DYNAMO_PID=$! # run prefill worker (shares GPU with decode) CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ @@ -65,7 +65,7 @@ PREFILL_PID=$! 
# run decode worker (shares GPU with prefill) CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=8082 \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ diff --git a/examples/backends/vllm/launch/agg_multimodal.sh b/examples/backends/vllm/launch/agg_multimodal.sh index c1a667b686..51ada957f6 100755 --- a/examples/backends/vllm/launch/agg_multimodal.sh +++ b/examples/backends/vllm/launch/agg_multimodal.sh @@ -52,7 +52,7 @@ fi # Multimodal data (images) are decoded in the backend worker using ImageLoader # --enforce-eager: Quick deployment (remove for production) # --connector none: No KV transfer needed for aggregated serving -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=8081 \ python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS # Wait for all background processes to complete diff --git a/lib/runtime/examples/system_metrics/README.md b/lib/runtime/examples/system_metrics/README.md index dfbd4291d0..bc47cce0cd 100644 --- a/lib/runtime/examples/system_metrics/README.md +++ b/lib/runtime/examples/system_metrics/README.md @@ -180,7 +180,7 @@ if enable_custom_metrics { ```bash # Run the system metrics example -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 cargo run --bin system_server +DYN_SYSTEM_PORT=8081 cargo run --bin system_server ``` The server will start an system status server on the specified port (8081 in this example) that exposes the Prometheus metrics endpoint at `/metrics`. @@ -189,7 +189,7 @@ To Run an actual LLM frontend + server (aggregated example), launch both of them ``` python -m dynamo.frontend & -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching & +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching & ``` Then make curl requests to the frontend (see the [main README](../../../../README.md)) diff --git a/lib/runtime/examples/system_metrics/tests/integration_test.rs b/lib/runtime/examples/system_metrics/tests/integration_test.rs index 3568b37abb..d536bd7a74 100644 --- a/lib/runtime/examples/system_metrics/tests/integration_test.rs +++ b/lib/runtime/examples/system_metrics/tests/integration_test.rs @@ -15,8 +15,7 @@ use tokio::time::{Duration, sleep}; #[tokio::test] async fn test_backend_with_metrics() -> Result<()> { - // Set environment variables for dynamic port allocation - env::set_var("DYN_SYSTEM_ENABLED", "true"); + // Set environment variable for dynamic port allocation (0 = auto-assign) env::set_var("DYN_SYSTEM_PORT", "0"); // Generate a random endpoint name to avoid collisions @@ -39,7 +38,7 @@ async fn test_backend_with_metrics() -> Result<()> { } None => { panic!( - "System status server not started - check DYN_SYSTEM_ENABLED environment variable" + "System status server not started - check DYN_SYSTEM_PORT environment variable" ); } }; diff --git a/tests/fault_tolerance/etcd_ha/test_sglang.py b/tests/fault_tolerance/etcd_ha/test_sglang.py index 8783a0fb6d..f4c099bcf3 100644 --- a/tests/fault_tolerance/etcd_ha/test_sglang.py +++ b/tests/fault_tolerance/etcd_ha/test_sglang.py @@ -88,7 +88,6 @@ def __init__(self, request, etcd_endpoints: list, mode: str = "agg"): env = os.environ.copy() env["DYN_LOG"] = "debug" env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints) - env["DYN_SYSTEM_ENABLED"] = "true" env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' 
env["DYN_SYSTEM_PORT"] = port diff --git a/tests/fault_tolerance/etcd_ha/test_trtllm.py b/tests/fault_tolerance/etcd_ha/test_trtllm.py index 67e839aeb5..330c4071df 100644 --- a/tests/fault_tolerance/etcd_ha/test_trtllm.py +++ b/tests/fault_tolerance/etcd_ha/test_trtllm.py @@ -88,7 +88,6 @@ def __init__( env = os.environ.copy() env["DYN_LOG"] = "debug" env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints) - env["DYN_SYSTEM_ENABLED"] = "true" env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_PORT"] = port diff --git a/tests/fault_tolerance/etcd_ha/test_vllm.py b/tests/fault_tolerance/etcd_ha/test_vllm.py index 2e58fdfe4b..d286444221 100644 --- a/tests/fault_tolerance/etcd_ha/test_vllm.py +++ b/tests/fault_tolerance/etcd_ha/test_vllm.py @@ -60,7 +60,6 @@ def __init__(self, request, etcd_endpoints: list, is_prefill: bool = False): env = os.environ.copy() env["DYN_LOG"] = "debug" env["ETCD_ENDPOINTS"] = ",".join(etcd_endpoints) - env["DYN_SYSTEM_ENABLED"] = "true" env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_PORT"] = port From db37a76b8d2d11cdf2059bf5ef73572c79d8abe6 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Mon, 10 Nov 2025 18:51:36 +0000 Subject: [PATCH 16/17] cargo fmt Signed-off-by: Keiven Chang --- lib/runtime/examples/system_metrics/tests/integration_test.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/runtime/examples/system_metrics/tests/integration_test.rs b/lib/runtime/examples/system_metrics/tests/integration_test.rs index d536bd7a74..8f78178265 100644 --- a/lib/runtime/examples/system_metrics/tests/integration_test.rs +++ b/lib/runtime/examples/system_metrics/tests/integration_test.rs @@ -37,9 +37,7 @@ async fn test_backend_with_metrics() -> Result<()> { info.port() } None => { - panic!( - "System status server not started - check DYN_SYSTEM_PORT environment variable" - ); + panic!("System status server not started - check DYN_SYSTEM_PORT environment variable"); } }; From c00d419407f36c433380906a9c502d0d6f54b5eb Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Mon, 10 Nov 2025 11:03:07 -0800 Subject: [PATCH 17/17] Add dynamomodel-guide.md to hidden toctree --- docs/hidden_toctree.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/hidden_toctree.rst b/docs/hidden_toctree.rst index e547694d3d..fdc52598e2 100644 --- a/docs/hidden_toctree.rst +++ b/docs/hidden_toctree.rst @@ -26,6 +26,7 @@ kubernetes/api_reference.md kubernetes/deployment/create_deployment.md + kubernetes/deployment/dynamomodel-guide.md kubernetes/fluxcd.md kubernetes/grove.md