Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
65c4f5b
refactor: consolidate observability stack into unified docker-observa…
keivenchang Nov 6, 2025
3b63e21
Reorganize observability configs and improve tracing docs
keivenchang Nov 6, 2025
0b71cf8
Standardize observability docs structure and improve clarity
keivenchang Nov 7, 2025
9cd7cf1
Add new observability docs to toctree
keivenchang Nov 7, 2025
8a80eee
Remove Optional Observability Stack section from README
keivenchang Nov 7, 2025
b6398ef
Fix documentation issues from observability reorganization
keivenchang Nov 7, 2025
ef2e529
Fix broken link anchor in Python metrics README
keivenchang Nov 7, 2025
4c1e061
Fix broken link in Python metrics examples README
keivenchang Nov 7, 2025
de7a07a
Simplify Grafana dashboard names and improve logging docs
keivenchang Nov 7, 2025
67541d0
Reorganize and standardize observability documentation
keivenchang Nov 7, 2025
6cac372
logging to not mention OTEL
keivenchang Nov 7, 2025
f220b9b
Merge branch 'main' into keivenchang/DIS-980__consolidate-OTEL-docker…
keivenchang Nov 7, 2025
8eabedd
Fix race condition in test_http_service
keivenchang Nov 8, 2025
ac5a72c
Clarify logging terminology and fix copyright headers
keivenchang Nov 8, 2025
24a15fd
add back README.md references, and removed unused podmonitor files
keivenchang Nov 10, 2025
1156f7c
Merge branch 'main' into keivenchang/DIS-980__consolidate-OTEL-docker…
keivenchang Nov 10, 2025
d53444c
Remove remaining DYN_SYSTEM_ENABLED variables
keivenchang Nov 10, 2025
db37a76
cargo fmt
keivenchang Nov 10, 2025
c00d419
Add dynamomodel-guide.md to hidden toctree
keivenchang Nov 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,8 @@ To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynam

To quickly setup etcd & NATS, you can also run:

```
```bash
# At the root of the repository:
# Edit deploy/docker-compose.yml to comment out "runtime: nvidia" of the dcgm-exporter service if the nvidia container runtime isn't deployed or to be used.
docker compose -f deploy/docker-compose.yml up -d
```

Expand Down
125 changes: 3 additions & 122 deletions deploy/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,26 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# IMPORTANT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
# Bare minimum infrastructure services for Dynamo.
# For observability (metrics, tracing, dashboards), use deploy/docker-observability.yml

networks:
server:
driver: bridge
monitoring:
driver: bridge

# Note that the images are pinned to specific versions to avoid breaking changes.
services:
nats-server:
image: nats:2.11.4
Expand All @@ -31,7 +18,6 @@ services:
- 8222:8222 # the endpoints include /varz, /healthz, ...
networks:
- server
- monitoring

etcd-server:
image: bitnamilegacy/etcd:3.6.1
Expand All @@ -42,108 +28,3 @@ services:
- 2380:2380
networks:
- server
- monitoring

# All the services below are part of the metrics profile and monitoring network.

# The exporter translates from /varz and other stats to Prometheus metrics
nats-prometheus-exporter:
image: natsio/prometheus-nats-exporter:0.17.3
command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
ports:
- 7777:7777
networks:
- monitoring
profiles: [metrics]
depends_on:
- nats-server

# DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
# dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
dcgm-exporter:
image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
ports:
# Expose dcgm-exporter on port 9401 both inside and outside the container
# to avoid conflicts with other dcgm-exporter instances in distributed environments.
# To access DCGM metrics:
# Outside the container: curl http://localhost:9401/metrics (or the host IP)
# Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
- 9401:9401
cap_add:
- SYS_ADMIN
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
# dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
- NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
- DCGM_EXPORTER_LISTEN=:9401
runtime: nvidia # Specify the NVIDIA runtime
networks:
- monitoring

# To access Prometheus from another machine, you may need to open the port in your host's firewall. On Ubuntu:
# sudo ufw allow 9090/tcp
prometheus:
image: prom/prometheus:v3.4.1
container_name: prometheus
volumes:
- ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
# These provide the web console functionality
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# Example to pull from the /query endpoint:
# {__name__=~"DCGM.*", job="dcgm-exporter"}
networks:
- monitoring
ports:
- "9090:9090"
profiles: [metrics]
extra_hosts:
- "host.docker.internal:host-gateway"
depends_on:
- dcgm-exporter
- nats-prometheus-exporter
- etcd-server

# grafana connects to prometheus via the /query endpoint.
# Default credentials are dynamo/dynamo.
# To access Grafana from another machine, you may need to open the port in your host's firewall. On Ubuntu:
# sudo ufw allow 3001/tcp
grafana:
image: grafana/grafana-enterprise:12.0.1
container_name: grafana
volumes:
- ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
- ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
environment:
- GF_SERVER_HTTP_PORT=3001
# do not make it admin/admin, because you will be prompted to change the password every time
- GF_SECURITY_ADMIN_USER=dynamo
- GF_SECURITY_ADMIN_PASSWORD=dynamo
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-piechart-panel
# Default min interval is 5s, but can be configured lower
- GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
# Disable password change requirement
- GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
- GF_SECURITY_ADMIN_PASSWORD_POLICY=false
- GF_AUTH_DISABLE_LOGIN_FORM=false
- GF_AUTH_DISABLE_SIGNOUT_MENU=false
restart: unless-stopped
ports:
- "3001:3001"
networks:
- monitoring
profiles: [metrics]
depends_on:
- prometheus
137 changes: 137 additions & 0 deletions deploy/docker-observability.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Observability stack for Dynamo: metrics, tracing, and visualization.
# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity.
#
# Usage:
# docker compose -f deploy/docker-observability.yml up -d

networks:
  # Pre-existing network that this stack joins rather than creates.
  # NOTE(review): "deploy_server" is presumably the name Compose gives the
  # "server" network of deploy/docker-compose.yml (project "deploy") — confirm
  # if that file is started with a different project name.
  server:
    external: true
    name: deploy_server

# Named volumes so Grafana and Tempo state survives container re-creation.
volumes:
  grafana-data: {}
  tempo-data: {}

services:
  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Expose dcgm-exporter on port 9401 both inside and outside the container
      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
      # To access DCGM metrics:
      #   Outside the container: curl http://localhost:9401/metrics (or the host IP)
      #   Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
      - "9401:9401"
    cap_add:
      - SYS_ADMIN
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
      # Must match the container port published above.
      - DCGM_EXPORTER_LISTEN=:9401
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - server

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # The final argument is the NATS monitoring URL; "nats-server" is resolved
    # over the shared "server" network.
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    ports:
      - "7777:7777"
    networks:
      - server

  # To access Prometheus from another machine, you may need to open the port
  # in your host's firewall. On Ubuntu:
  #   sudo ufw allow 9090/tcp
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    ports:
      - "9090:9090"
    networks:
      - server
    # Lets the container reach services on the Docker host as host.docker.internal.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter

  # Tempo - Distributed tracing backend
  tempo:
    image: grafana/tempo:2.8.2
    command: ["-config.file=/etc/tempo.yaml"]
    # NOTE(review): presumably runs as root so Tempo can write to the
    # tempo-data volume mounted at /tmp/tempo — confirm.
    user: root
    volumes:
      - ./observability/tempo.yaml:/etc/tempo.yaml
      - tempo-data:/tmp/tempo
    ports:
      - "3200:3200"  # Tempo HTTP
      - "4317:4317"  # OTLP gRPC receiver (accessible from host)
      - "4318:4318"  # OTLP HTTP receiver (accessible from host)
    networks:
      - server

  # Grafana - Visualization and dashboards
  # Supports both Prometheus (metrics) and Tempo (tracing) datasources
  # Default credentials: dynamo/dynamo
  # To access Grafana from another machine, you may need to open the port
  # in your host's firewall. On Ubuntu:
  #   sudo ufw allow 3000/tcp
  grafana:
    image: grafana/grafana:12.2.0
    container_name: grafana
    volumes:
      - grafana-data:/var/lib/grafana
      - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
      - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
    environment:
      # Must match the container port published below.
      - GF_SERVER_HTTP_PORT=3000
      # do not make it admin/admin, because you will be prompted to change the password every time
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Disable password change requirement
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
    restart: unless-stopped
    ports:
      - "3000:3000"
    networks:
      - server
    depends_on:
      - prometheus
      - tempo

Original file line number Diff line number Diff line change
Expand Up @@ -1020,7 +1020,7 @@
},
"timepicker": {},
"timezone": "browser",
"title": "Dynamo Dashboard",
"title": "Dynamo Dashboard (generic)",
"uid": "97ae8df9-138a-4f7a-9b0f-635b77d818fe",
"version": 1
}
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -1002,7 +1002,7 @@ data:
},
"timepicker": {},
"timezone": "browser",
"title": "Dynamo Dashboard",
"title": "Dynamo Dashboard (generic)",
"uid": "dynamo-dashboard",
"version": 1
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Dynamo Logging on Kubernetes

For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../docs/kubernetes/observability/logging.md).
For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../../../docs/kubernetes/observability/logging.md).
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ datasources:
access: proxy
url: http://tempo:3200
uid: tempo
isDefault: true
isDefault: false
editable: true
jsonData:
httpMethod: GET
Expand Down
File renamed without changes.
35 changes: 0 additions & 35 deletions deploy/tracing/docker-compose.yml

This file was deleted.

6 changes: 5 additions & 1 deletion docs/_sections/observability.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ Observability
.. toctree::
:hidden:

Overview <../observability/README>
Prometheus + Grafana Setup <../observability/prometheus-grafana>
Metrics <../observability/metrics>
Metrics Developer Guide <../observability/metrics-developer-guide>
Health Checks <../observability/health-checks>
Tracing <../observability/tracing>
Logging <../observability/logging>
Health Checks <../observability/health-checks>
Loading
Loading