zombienet_orchestrator/
observability.rs1use std::path::Path;
2
3use configuration::ObservabilityConfig;
4use support::fs::FileSystem;
5use tokio::process::Command;
6use tracing::{debug, trace};
7
8use crate::{generators, network::node::NetworkNode};
9
/// Lifecycle state of the observability stack.
#[derive(Debug, Clone, Default)]
pub enum ObservabilityState {
    /// No stack has been started. This is the `Default` value.
    #[default]
    NotStarted,
    /// A stack is up; carries the [`ObservabilityInfo`] describing it.
    Running(ObservabilityInfo),
    /// Carries no data. Presumably set by the owner of this state once the
    /// stack has been torn down (the transition is not visible in this file).
    Stopped,
}
21
/// Details of a spawned Prometheus + Grafana pair, as returned by
/// `spawn_observability_stack` and consumed by `cleanup_observability_stack`.
#[derive(Debug, Clone)]
pub struct ObservabilityInfo {
    /// URL of the Prometheus instance (`spawn_observability_stack` builds it
    /// as `http://127.0.0.1:<port>`).
    pub prometheus_url: String,
    /// URL of the Grafana instance (`spawn_observability_stack` builds it
    /// as `http://127.0.0.1:<port>`).
    pub grafana_url: String,
    /// Name of the Prometheus container, used to remove it on cleanup.
    prometheus_container_name: String,
    /// Name of the Grafana container, used to remove it on cleanup.
    grafana_container_name: String,
    /// Container runtime binary that started the containers and is used
    /// again to remove them.
    container_runtime: String,
}
36
37impl ObservabilityState {
38 pub fn as_runnnig(&self) -> Option<&ObservabilityInfo> {
39 match self {
40 ObservabilityState::Running(info) => Some(info),
41 _ => None,
42 }
43 }
44}
45
46pub async fn spawn_observability_stack<T: FileSystem>(
48 config: &ObservabilityConfig,
49 nodes: &[&NetworkNode],
50 ns_name: &str,
51 base_dir: &Path,
52 filesystem: &T,
53) -> Result<ObservabilityInfo, anyhow::Error> {
54 let container_runtime = detect_container_runtime().await?;
55 debug!("Using container runtime: {container_runtime}");
56
57 let (host_addr, use_host_network) = get_networking();
58
59 let prom_parked = generators::generate_node_port(config.prometheus_port())?;
60 let prom_port = prom_parked.0;
61 let grafana_parked = generators::generate_node_port(config.grafana_port())?;
62 let grafana_port = grafana_parked.0;
63
64 let obs_dir = base_dir.join("observability");
66 let prom_dir = obs_dir.join("prometheus");
67 let grafana_ds_dir = obs_dir.join("grafana/provisioning/datasources");
68 filesystem.create_dir_all(&prom_dir).await?;
69 filesystem.create_dir_all(&grafana_ds_dir).await?;
70
71 let prom_config = generate_prometheus_config(nodes, &host_addr);
73 trace!("Generated prometheus.yml:\n{prom_config}");
74 filesystem
75 .write(prom_dir.join("prometheus.yml"), prom_config.as_bytes())
76 .await?;
77
78 let prom_url_for_grafana = if use_host_network {
79 format!("http://127.0.0.1:{prom_port}")
80 } else {
81 format!("http://{host_addr}:{prom_port}")
82 };
83
84 let grafana_ds = generate_grafana_datasource(&prom_url_for_grafana);
85 trace!("Generated grafana datasource.yml:\n{grafana_ds}");
86 filesystem
87 .write(grafana_ds_dir.join("datasource.yml"), grafana_ds.as_bytes())
88 .await?;
89
90 let prom_container = format!("{ns_name}-prometheus");
91 let grafana_container = format!("{ns_name}-grafana");
92
93 let mut prom_cmd = Command::new(&container_runtime);
94 prom_cmd.args([
95 "run",
96 "-d",
97 "--name",
98 &prom_container,
99 "-v",
100 &format!("{}:/etc/prometheus", prom_dir.display()),
101 ]);
102
103 if use_host_network {
104 prom_cmd.args(["--network=host"]);
105 } else {
106 prom_cmd.args(["-p", &format!("{prom_port}:9090")]);
107 }
108
109 prom_parked.drop_listener();
110
111 prom_cmd.args([
112 config.prometheus_image(),
113 "--config.file=/etc/prometheus/prometheus.yml",
114 "--storage.tsdb.path=/prometheus",
115 ]);
116
117 if use_host_network {
118 prom_cmd.arg(format!("--web.listen-address=0.0.0.0:{prom_port}"));
120 }
121
122 let output = prom_cmd.output().await?;
123 if !output.status.success() {
124 let stderr = String::from_utf8_lossy(&output.stderr);
125 return Err(anyhow::anyhow!(
126 "Failed to start Prometheus container: {stderr}"
127 ));
128 }
129 debug!("Prometheus container started: {prom_container}");
130
131 let mut grafana_cmd = Command::new(&container_runtime);
132 grafana_cmd.args([
133 "run",
134 "-d",
135 "--name",
136 &grafana_container,
137 "-v",
138 &format!(
139 "{}:/etc/grafana/provisioning",
140 obs_dir.join("grafana/provisioning").display()
141 ),
142 "-e",
143 "GF_AUTH_ANONYMOUS_ENABLED=true",
144 "-e",
145 "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin",
146 "-e",
147 "GF_SECURITY_ADMIN_PASSWORD=admin",
148 ]);
149
150 if use_host_network {
151 grafana_cmd.args(["--network=host"]);
152 grafana_cmd.args(["-e", &format!("GF_SERVER_HTTP_PORT={grafana_port}")]);
153 } else {
154 grafana_cmd.args(["-p", &format!("{grafana_port}:3000")]);
155 }
156
157 grafana_cmd.arg(config.grafana_image());
158
159 grafana_parked.drop_listener();
160
161 let output = grafana_cmd.output().await?;
162 if !output.status.success() {
163 let stderr = String::from_utf8_lossy(&output.stderr);
164 let _ = Command::new(&container_runtime)
166 .args(["rm", "--force", &prom_container])
167 .output()
168 .await;
169 return Err(anyhow::anyhow!(
170 "Failed to start Grafana container: {stderr}"
171 ));
172 }
173 debug!("Grafana container started: {grafana_container}");
174
175 Ok(ObservabilityInfo {
176 prometheus_url: format!("http://127.0.0.1:{prom_port}"),
177 grafana_url: format!("http://127.0.0.1:{grafana_port}"),
178 prometheus_container_name: prom_container,
179 grafana_container_name: grafana_container,
180 container_runtime,
181 })
182}
183
184pub async fn cleanup_observability_stack(info: &ObservabilityInfo) -> Result<(), anyhow::Error> {
185 for container in [
186 &info.prometheus_container_name,
187 &info.grafana_container_name,
188 ] {
189 let output = Command::new(&info.container_runtime)
190 .args(["rm", "--force", container])
191 .output()
192 .await?;
193
194 if !output.status.success() {
195 let stderr = String::from_utf8_lossy(&output.stderr);
196 debug!("Warning: failed to remove container {container}: {stderr}");
197 }
198 }
199
200 Ok(())
201}
202
203pub(crate) fn generate_prometheus_config(nodes: &[&NetworkNode], host_addr: &str) -> String {
204 let mut targets = String::new();
205
206 for node in nodes {
207 if let Some(port) = extract_port_from_uri(&node.prometheus_uri) {
208 targets.push_str(&format!(
209 " - targets: ['{host_addr}:{port}']\n labels:\n node: '{}'\n",
210 node.name
211 ));
212 }
213 }
214
215 format!(
216 "global:\n scrape_interval: 5s\n evaluation_interval: 5s\n\nscrape_configs:\n - job_name: 'zombienet'\n metrics_path: /metrics\n static_configs:\n{targets}"
217 )
218}
219
/// Renders a Grafana data source provisioning file that registers a default,
/// editable Prometheus data source (proxy access) pointing at `prom_url`.
///
/// The data source's keys are indented below the `- name:` list item so the
/// output is well-formed YAML.
pub(crate) fn generate_grafana_datasource(prom_url: &str) -> String {
    format!(
        "apiVersion: 1\ndatasources:\n  - name: Prometheus\n    type: prometheus\n    access: proxy\n    url: {prom_url}\n    isDefault: true\n    editable: true\n"
    )
}
225
226async fn detect_container_runtime() -> Result<String, anyhow::Error> {
227 if let Ok(output) = Command::new("docker").arg("version").output().await {
228 if output.status.success() {
229 let stdout = String::from_utf8_lossy(&output.stdout);
230 if stdout.to_lowercase().contains("podman") {
231 return Ok("podman".to_string());
232 }
233 return Ok("docker".to_string());
234 }
235 }
236
237 if let Ok(output) = Command::new("podman").arg("version").output().await {
238 if output.status.success() {
239 return Ok("podman".to_string());
240 }
241 }
242
243 Err(anyhow::anyhow!(
244 "No container runtime found. Install Docker or Podman to use the observability stack."
245 ))
246}
247
/// Returns `(host_addr, use_host_network)` for the current operating system.
///
/// On Linux this is `("127.0.0.1", true)`; on every other OS it is
/// `("host.docker.internal", false)`.
fn get_networking() -> (String, bool) {
    let on_linux = std::env::consts::OS == "linux";
    let host_addr = if on_linux {
        "127.0.0.1"
    } else {
        "host.docker.internal"
    };
    (host_addr.to_string(), on_linux)
}
254
/// Extracts the port from a URI such as `http://127.0.0.1:9615/metrics`.
///
/// Takes the text after the last `:`, cuts it at the first `/`, and parses
/// what is left as a `u16`. Returns `None` when that text is not a valid
/// `u16`.
fn extract_port_from_uri(uri: &str) -> Option<u16> {
    // `rsplit` always yields at least one piece, so without any ':' the
    // whole input is examined.
    let after_last_colon = uri.rsplit(':').next()?;
    let candidate = after_last_colon.split('/').next()?;
    candidate.parse().ok()
}