zombienet_orchestrator/
observability.rs

1use std::path::Path;
2
3use configuration::ObservabilityConfig;
4use support::fs::FileSystem;
5use tokio::process::Command;
6use tracing::{debug, trace};
7
8use crate::{generators, network::node::NetworkNode};
9
/// Lifecycle state of the observability stack within a network
#[derive(Debug, Clone, Default)]
pub enum ObservabilityState {
    /// Stack has never been started
    #[default]
    NotStarted,
    /// Stack is running
    Running(ObservabilityInfo),
    /// Stack was running but has been stopped
    Stopped,
}

/// Information about a running observability stack
#[derive(Debug, Clone)]
pub struct ObservabilityInfo {
    /// URL where Prometheus is accessible
    pub prometheus_url: String,
    /// URL where Grafana is accessible
    pub grafana_url: String,
    /// Name of the Prometheus Docker container
    prometheus_container_name: String,
    /// Name of the Grafana Docker container
    grafana_container_name: String,
    /// Container runtime binary (docker or podman)
    container_runtime: String,
}

impl ObservabilityState {
    /// Returns the details of the running stack, or `None` when the state is
    /// anything other than [`ObservabilityState::Running`].
    pub fn as_running(&self) -> Option<&ObservabilityInfo> {
        // Exhaustive on purpose: adding a variant should force a decision here.
        match self {
            ObservabilityState::Running(info) => Some(info),
            ObservabilityState::NotStarted | ObservabilityState::Stopped => None,
        }
    }

    /// Misspelled alias of [`ObservabilityState::as_running`].
    ///
    /// Kept only so existing callers keep compiling; new code should call
    /// `as_running` instead.
    pub fn as_runnnig(&self) -> Option<&ObservabilityInfo> {
        self.as_running()
    }
}
45
46/// Spawn the observability stack (Prometheus + Grafana) as Docker/Podman containers
47pub async fn spawn_observability_stack<T: FileSystem>(
48    config: &ObservabilityConfig,
49    nodes: &[&NetworkNode],
50    ns_name: &str,
51    base_dir: &Path,
52    filesystem: &T,
53) -> Result<ObservabilityInfo, anyhow::Error> {
54    let container_runtime = detect_container_runtime().await?;
55    debug!("Using container runtime: {container_runtime}");
56
57    let (host_addr, use_host_network) = get_networking();
58
59    let prom_parked = generators::generate_node_port(config.prometheus_port())?;
60    let prom_port = prom_parked.0;
61    let grafana_parked = generators::generate_node_port(config.grafana_port())?;
62    let grafana_port = grafana_parked.0;
63
64    // Create dirs
65    let obs_dir = base_dir.join("observability");
66    let prom_dir = obs_dir.join("prometheus");
67    let grafana_ds_dir = obs_dir.join("grafana/provisioning/datasources");
68    filesystem.create_dir_all(&prom_dir).await?;
69    filesystem.create_dir_all(&grafana_ds_dir).await?;
70
71    // Generate and write Prometheus config
72    let prom_config = generate_prometheus_config(nodes, &host_addr);
73    trace!("Generated prometheus.yml:\n{prom_config}");
74    filesystem
75        .write(prom_dir.join("prometheus.yml"), prom_config.as_bytes())
76        .await?;
77
78    let prom_url_for_grafana = if use_host_network {
79        format!("http://127.0.0.1:{prom_port}")
80    } else {
81        format!("http://{host_addr}:{prom_port}")
82    };
83
84    let grafana_ds = generate_grafana_datasource(&prom_url_for_grafana);
85    trace!("Generated grafana datasource.yml:\n{grafana_ds}");
86    filesystem
87        .write(grafana_ds_dir.join("datasource.yml"), grafana_ds.as_bytes())
88        .await?;
89
90    let prom_container = format!("{ns_name}-prometheus");
91    let grafana_container = format!("{ns_name}-grafana");
92
93    let mut prom_cmd = Command::new(&container_runtime);
94    prom_cmd.args([
95        "run",
96        "-d",
97        "--name",
98        &prom_container,
99        "-v",
100        &format!("{}:/etc/prometheus", prom_dir.display()),
101    ]);
102
103    if use_host_network {
104        prom_cmd.args(["--network=host"]);
105    } else {
106        prom_cmd.args(["-p", &format!("{prom_port}:9090")]);
107    }
108
109    prom_parked.drop_listener();
110
111    prom_cmd.args([
112        config.prometheus_image(),
113        "--config.file=/etc/prometheus/prometheus.yml",
114        "--storage.tsdb.path=/prometheus",
115    ]);
116
117    if use_host_network {
118        // When using host network, override the listen address to use the assigned port
119        prom_cmd.arg(format!("--web.listen-address=0.0.0.0:{prom_port}"));
120    }
121
122    let output = prom_cmd.output().await?;
123    if !output.status.success() {
124        let stderr = String::from_utf8_lossy(&output.stderr);
125        return Err(anyhow::anyhow!(
126            "Failed to start Prometheus container: {stderr}"
127        ));
128    }
129    debug!("Prometheus container started: {prom_container}");
130
131    let mut grafana_cmd = Command::new(&container_runtime);
132    grafana_cmd.args([
133        "run",
134        "-d",
135        "--name",
136        &grafana_container,
137        "-v",
138        &format!(
139            "{}:/etc/grafana/provisioning",
140            obs_dir.join("grafana/provisioning").display()
141        ),
142        "-e",
143        "GF_AUTH_ANONYMOUS_ENABLED=true",
144        "-e",
145        "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin",
146        "-e",
147        "GF_SECURITY_ADMIN_PASSWORD=admin",
148    ]);
149
150    if use_host_network {
151        grafana_cmd.args(["--network=host"]);
152        grafana_cmd.args(["-e", &format!("GF_SERVER_HTTP_PORT={grafana_port}")]);
153    } else {
154        grafana_cmd.args(["-p", &format!("{grafana_port}:3000")]);
155    }
156
157    grafana_cmd.arg(config.grafana_image());
158
159    grafana_parked.drop_listener();
160
161    let output = grafana_cmd.output().await?;
162    if !output.status.success() {
163        let stderr = String::from_utf8_lossy(&output.stderr);
164        // Clean up Prometheus container on failure
165        let _ = Command::new(&container_runtime)
166            .args(["rm", "--force", &prom_container])
167            .output()
168            .await;
169        return Err(anyhow::anyhow!(
170            "Failed to start Grafana container: {stderr}"
171        ));
172    }
173    debug!("Grafana container started: {grafana_container}");
174
175    Ok(ObservabilityInfo {
176        prometheus_url: format!("http://127.0.0.1:{prom_port}"),
177        grafana_url: format!("http://127.0.0.1:{grafana_port}"),
178        prometheus_container_name: prom_container,
179        grafana_container_name: grafana_container,
180        container_runtime,
181    })
182}
183
184pub async fn cleanup_observability_stack(info: &ObservabilityInfo) -> Result<(), anyhow::Error> {
185    for container in [
186        &info.prometheus_container_name,
187        &info.grafana_container_name,
188    ] {
189        let output = Command::new(&info.container_runtime)
190            .args(["rm", "--force", container])
191            .output()
192            .await?;
193
194        if !output.status.success() {
195            let stderr = String::from_utf8_lossy(&output.stderr);
196            debug!("Warning: failed to remove container {container}: {stderr}");
197        }
198    }
199
200    Ok(())
201}
202
203pub(crate) fn generate_prometheus_config(nodes: &[&NetworkNode], host_addr: &str) -> String {
204    let mut targets = String::new();
205
206    for node in nodes {
207        if let Some(port) = extract_port_from_uri(&node.prometheus_uri) {
208            targets.push_str(&format!(
209                "      - targets: ['{host_addr}:{port}']\n        labels:\n          node: '{}'\n",
210                node.name
211            ));
212        }
213    }
214
215    format!(
216        "global:\n  scrape_interval: 5s\n  evaluation_interval: 5s\n\nscrape_configs:\n  - job_name: 'zombienet'\n    metrics_path: /metrics\n    static_configs:\n{targets}"
217    )
218}
219
/// Render the Grafana provisioning file that registers Prometheus at
/// `prom_url` as the default, editable, proxied datasource.
pub(crate) fn generate_grafana_datasource(prom_url: &str) -> String {
    // One source line per YAML line keeps the indentation easy to audit.
    format!(
        concat!(
            "apiVersion: 1\n",
            "datasources:\n",
            "  - name: Prometheus\n",
            "    type: prometheus\n",
            "    access: proxy\n",
            "    url: {}\n",
            "    isDefault: true\n",
            "    editable: true\n",
        ),
        prom_url
    )
}
225
226async fn detect_container_runtime() -> Result<String, anyhow::Error> {
227    if let Ok(output) = Command::new("docker").arg("version").output().await {
228        if output.status.success() {
229            let stdout = String::from_utf8_lossy(&output.stdout);
230            if stdout.to_lowercase().contains("podman") {
231                return Ok("podman".to_string());
232            }
233            return Ok("docker".to_string());
234        }
235    }
236
237    if let Ok(output) = Command::new("podman").arg("version").output().await {
238        if output.status.success() {
239            return Ok("podman".to_string());
240        }
241    }
242
243    Err(anyhow::anyhow!(
244        "No container runtime found. Install Docker or Podman to use the observability stack."
245    ))
246}
247
/// Choose how containers reach the host, based on the operating system.
///
/// Returns `(host_addr, use_host_network)`: loopback with host networking on
/// Linux, `host.docker.internal` without host networking everywhere else.
fn get_networking() -> (String, bool) {
    let on_linux = std::env::consts::OS == "linux";
    let host_addr = if on_linux {
        "127.0.0.1"
    } else {
        "host.docker.internal"
    };
    (host_addr.to_string(), on_linux)
}
254
/// Extract the port from a URI such as `http://127.0.0.1:9615/metrics`.
///
/// The candidate is the text after the last `:` (the whole input when there
/// is no `:`), cut at the first `/`. Returns `None` when that text does not
/// parse as a `u16`.
fn extract_port_from_uri(uri: &str) -> Option<u16> {
    let after_colon = match uri.rsplit_once(':') {
        Some((_, rest)) => rest,
        None => uri,
    };
    let candidate = match after_colon.split_once('/') {
        Some((head, _)) => head,
        None => after_colon,
    };
    candidate.parse().ok()
}