ai-dynamo · keivenchang · Nov 10, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 7, 2025
diff --git a/README.md b/README.md
@@ -101,9 +101,8 @@ To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynam
 
 To quickly set up etcd & NATS, you can also run:
 
-```
+```bash
 # At the root of the repository:
-# Edit deploy/docker-compose.yml to comment out "runtime: nvidia" of the dcgm-exporter service if the nvidia container runtime isn't deployed or to be used.
 docker compose -f deploy/docker-compose.yml up -d
 ```
 

@@ -204,7 +204,7 @@ def setup_prometheus_registry(
     SGLang uses multiprocess architecture where metrics are stored in shared memory.
     MultiProcessCollector aggregates metrics from all worker processes. The Prometheus
     registry collects sglang:* metrics which are exposed via the metrics server endpoint
-    (typically port 8081) when DYN_SYSTEM_ENABLED=true.
+    (set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081).
 
     Args:
         engine: The SGLang engine instance.

@@ -1,26 +1,13 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
-# IMPORT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
+# Bare minimum infrastructure services for Dynamo.
+# For observability (metrics, tracing, dashboards), use docker-observability.yml
+
 networks:
   server:
     driver: bridge
-  monitoring:
-    driver: bridge
 
-# Note that the images are pinned to specific versions to avoid breaking changes.
 services:
   nats-server:
     image: nats:2.11.4
@@ -31,7 +18,6 @@ services:
       - 8222:8222  # the endpoints include /varz, /healthz, ...
     networks:
       - server
-      - monitoring
 
   etcd-server:
     image: bitnamilegacy/etcd:3.6.1
@@ -42,108 +28,3 @@ services:
       - 2380:2380
     networks:
       - server
-      - monitoring
-
-  # All the services below are part of the metrics profile and monitoring network.
-
-  # The exporter translates from /varz and other stats to Prometheus metrics
-  nats-prometheus-exporter:
-    image: natsio/prometheus-nats-exporter:0.17.3
-    command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
-    ports:
-      - 7777:7777
-    networks:
-      - monitoring
-    profiles: [metrics]
-    depends_on:
-      - nats-server
-
-  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
-  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
-  dcgm-exporter:
-    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
-    ports:
-      # Expose dcgm-exporter on port 9401 both inside and outside the container
-      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
-      # To access DCGM metrics:
-      # Outside the container: curl http://localhost:9401/metrics (or the host IP)
-      # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
-      - 9401:9401
-    cap_add:
-      - SYS_ADMIN
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [gpu]
-    environment:
-      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
-      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
-      - DCGM_EXPORTER_LISTEN=:9401
-    runtime: nvidia  # Specify the NVIDIA runtime
-    networks:
-      - monitoring
-
-  # To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu:
-  # sudo ufw allow 9090/tcp
-  prometheus:
-    image: prom/prometheus:v3.4.1
-    container_name: prometheus
-    volumes:
-      - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-      - '--storage.tsdb.path=/prometheus'
-      # These provide the web console functionality
-      - '--web.console.libraries=/etc/prometheus/console_libraries'
-      - '--web.console.templates=/etc/prometheus/consoles'
-      - '--web.enable-lifecycle'
-    restart: unless-stopped
-    # Example to pull from the /query endpoint:
-    # {__name__=~"DCGM.*", job="dcgm-exporter"}
-    networks:
-      - monitoring
-    ports:
-      - "9090:9090"
-    profiles: [metrics]
-    extra_hosts:
-    - "host.docker.internal:host-gateway"
-    depends_on:
-      - dcgm-exporter
-      - nats-prometheus-exporter
-      - etcd-server
-
-  # grafana connects to prometheus via the /query endpoint.
-  # Default credentials are dynamo/dynamo.
-  # To access Grafana from another machine, you may need to disable te firewall on your host. On Ubuntu:
-  # sudo ufw allow 3001/tcp
-  grafana:
-    image: grafana/grafana-enterprise:12.0.1
-    container_name: grafana
-    volumes:
-      - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
-      - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
-    environment:
-      - GF_SERVER_HTTP_PORT=3001
-      # do not make it admin/admin, because you will be prompted to change the password every time
-      - GF_SECURITY_ADMIN_USER=dynamo
-      - GF_SECURITY_ADMIN_PASSWORD=dynamo
-      - GF_USERS_ALLOW_SIGN_UP=false
-      - GF_INSTALL_PLUGINS=grafana-piechart-panel
-      # Default min interval is 5s, but can be configured lower
-      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
-      # Disable password change requirement
-      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
-      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
-      - GF_AUTH_DISABLE_LOGIN_FORM=false
-      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
-    restart: unless-stopped
-    ports:
-      - "3001:3001"
-    networks:
-      - monitoring
-    profiles: [metrics]
-    depends_on:
-      - prometheus
@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Observability stack for Dynamo: metrics, tracing, and visualization.
+# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity.
+#
+# Usage:
+#   docker compose -f deploy/docker-observability.yml up -d
+
+# The obsolete top-level `version` key is intentionally not set (Compose ignores it and warns).
+
+networks:
+  server:
+    external: true  # not created here; deploy/docker-compose.yml must already be up
+    name: deploy_server  # presumably "<project>_server" with project "deploy" - verify
+
+volumes:
+  grafana-data: {}
+  tempo-data: {}
+
+services:
+  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
+  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
+  dcgm-exporter:
+    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
+    ports:
+      # Expose dcgm-exporter on port 9401 both inside and outside the container
+      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
+      # To access DCGM metrics:
+      # Outside the container: curl http://localhost:9401/metrics (or the host IP)
+      # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
+      - "9401:9401"
+    cap_add:
+      - SYS_ADMIN
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      # dcgm-exporter reads NVIDIA_VISIBLE_DEVICES; forward CUDA_VISIBLE_DEVICES to it, defaulting to "all"
+      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
+      - DCGM_EXPORTER_LISTEN=:9401
+    runtime: nvidia  # comment this out if the NVIDIA container runtime isn't deployed or to be used
+    networks:
+      - server
+
+  # Translates NATS stats (/varz etc. on nats-server:8222, from deploy/docker-compose.yml) to Prometheus metrics
+  nats-prometheus-exporter:
+    image: natsio/prometheus-nats-exporter:0.17.3
+    command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
+    ports:
+      - "7777:7777"
+    networks:
+      - server
+
+  # To access Prometheus from another machine, you may need to open port 9090 in your host's firewall. On Ubuntu:
+  # sudo ufw allow 9090/tcp
+  prometheus:
+    image: prom/prometheus:v3.4.1
+    container_name: prometheus
+    volumes:
+      - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      # These provide the web console functionality
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    # Example to pull from the /query endpoint:
+    # {__name__=~"DCGM.*", job="dcgm-exporter"}
+    ports:
+      - "9090:9090"
+    networks:
+      - server
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    depends_on:
+      - dcgm-exporter
+      - nats-prometheus-exporter
+
+  # Tempo - distributed tracing backend; its config is mounted from ./observability/tempo.yaml
+  tempo:
+    image: grafana/tempo:2.8.2
+    command: [ "-config.file=/etc/tempo.yaml" ]
+    user: root  # NOTE(review): presumably needed to write to the tempo-data volume - confirm
+    volumes:
+      - ./observability/tempo.yaml:/etc/tempo.yaml
+      - tempo-data:/tmp/tempo  # named volume declared under top-level `volumes`
+    ports:
+      - "3200:3200"   # Tempo HTTP
+      - "4317:4317"   # OTLP gRPC receiver (accessible from host)
+      - "4318:4318"   # OTLP HTTP receiver (accessible from host)
+    networks:
+      - server
+
+  # Grafana - Visualization and dashboards
+  # Supports both Prometheus (metrics) and Tempo (tracing) datasources
+  # Default credentials: dynamo/dynamo
+  # To access Grafana from another machine, you may need to open port 3000 in your host's firewall. On Ubuntu:
+  # sudo ufw allow 3000/tcp
+  grafana:
+    image: grafana/grafana:12.2.0
+    container_name: grafana
+    volumes:
+      - grafana-data:/var/lib/grafana
+      - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
+      - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
+      - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
+    environment:
+      - GF_SERVER_HTTP_PORT=3000
+      # do not make it admin/admin, because you will be prompted to change the password every time
+      - GF_SECURITY_ADMIN_USER=dynamo
+      - GF_SECURITY_ADMIN_PASSWORD=dynamo
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
+      - GF_INSTALL_PLUGINS=grafana-piechart-panel  # NOTE(review): confirm this plugin is still needed on Grafana 12
+      # Default min interval is 5s, but can be configured lower
+      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
+      # Disable password change requirement
+      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
+      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
+      - GF_AUTH_DISABLE_LOGIN_FORM=false
+      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
+    restart: unless-stopped
+    ports:
+      - "3000:3000"
+    networks:
+      - server
+    depends_on:
+      - prometheus
+      - tempo
+
@@ -0,0 +1,3 @@
+# Dynamo Observability
+
+For detailed documentation on Observability (Prometheus metrics, tracing, and logging), please refer to [docs/observability/](../../docs/observability/).
@@ -0,0 +1,11 @@
+# Example Grafana Dashboards
+
+This directory contains example Grafana dashboards for Dynamo observability. These are starter files that you can use as references for building your own custom dashboards.
+
+- `dynamo.json` - General Dynamo dashboard showing software and hardware metrics
+- `dcgm-metrics.json` - GPU metrics dashboard using DCGM exporter data
+- `kvbm.json` - KV Block Manager metrics dashboard
+- `temp-loki.json` - Logging dashboard for Loki integration
+- `dashboard-providers.yml` - Configuration file for dashboard provisioning
+
+For setup instructions and usage, see [Observability Documentation](../../../docs/observability/).
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Dynamo Observability

		For detailed documentation on Observability (Prometheus metrics, tracing, and logging), please refer to [docs/observability/](../../docs/observability/).