referrerpolicy=no-referrer-when-downgrade

polkadot_node_core_pvf/
metrics.rs

// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.

//! Prometheus metrics related to the validation host.
18
19use polkadot_node_core_pvf_common::prepare::MemoryStats;
20use polkadot_node_metrics::metrics::{self, prometheus};
21use polkadot_node_subsystem::messages::PvfExecKind;
22
23/// Validation host metrics.
24#[derive(Default, Clone)]
25pub struct Metrics(Option<MetricsInner>);
26
27impl Metrics {
28	/// Returns a handle to submit prepare workers metrics.
29	pub(crate) fn prepare_worker(&'_ self) -> WorkerRelatedMetrics<'_> {
30		WorkerRelatedMetrics { metrics: self, flavor: WorkerFlavor::Prepare }
31	}
32
33	/// Returns a handle to submit execute workers metrics.
34	pub(crate) fn execute_worker(&'_ self) -> WorkerRelatedMetrics<'_> {
35		WorkerRelatedMetrics { metrics: self, flavor: WorkerFlavor::Execute }
36	}
37
38	/// When preparation pipeline had a new item enqueued.
39	pub(crate) fn prepare_enqueued(&self) {
40		if let Some(metrics) = &self.0 {
41			metrics.prepare_enqueued.inc();
42		}
43	}
44
45	/// When preparation pipeline concluded working on an item.
46	pub(crate) fn prepare_concluded(&self) {
47		if let Some(metrics) = &self.0 {
48			metrics.prepare_concluded.inc();
49		}
50	}
51
52	/// When execution pipeline had a new item enqueued.
53	pub(crate) fn execute_enqueued(&self) {
54		if let Some(metrics) = &self.0 {
55			metrics.execute_enqueued.inc();
56		}
57	}
58
59	/// When execution pipeline finished executing a request.
60	pub(crate) fn execute_finished(&self) {
61		if let Some(metrics) = &self.0 {
62			metrics.execute_finished.inc();
63		}
64	}
65
66	/// Time between sending preparation request to a worker to having the response.
67	pub(crate) fn time_preparation(
68		&self,
69	) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
70		self.0.as_ref().map(|metrics| metrics.preparation_time.start_timer())
71	}
72
73	/// Time between sending execution request to a worker to having the response.
74	pub(crate) fn time_execution(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
75		self.0.as_ref().map(|metrics| metrics.execution_time.start_timer())
76	}
77
78	pub(crate) fn observe_execution_queued_time(&self, queued_for_millis: u32) {
79		self.0.as_ref().map(|metrics| {
80			metrics.execution_queued_time.observe(queued_for_millis as f64 / 1000 as f64)
81		});
82	}
83
84	/// Observe memory stats for preparation.
85	#[allow(unused_variables)]
86	pub(crate) fn observe_preparation_memory_metrics(&self, memory_stats: MemoryStats) {
87		if let Some(metrics) = &self.0 {
88			#[cfg(target_os = "linux")]
89			if let Some(max_rss) = memory_stats.max_rss {
90				metrics.preparation_max_rss.observe(max_rss as f64);
91			}
92
93			#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
94			if let Some(tracker_stats) = memory_stats.memory_tracker_stats {
95				// We convert these stats from B to KB to match the unit of `ru_maxrss` from
96				// `getrusage`.
97				let max_resident_kb = (tracker_stats.resident / 1024) as f64;
98				let max_allocated_kb = (tracker_stats.allocated / 1024) as f64;
99
100				metrics.preparation_max_resident.observe(max_resident_kb);
101				metrics.preparation_max_allocated.observe(max_allocated_kb);
102			}
103
104			metrics
105				.preparation_peak_tracked_allocation
106				.observe((memory_stats.peak_tracked_alloc / 1024) as f64);
107		}
108	}
109
110	pub(crate) fn observe_code_size(&self, code_size: usize) {
111		if let Some(metrics) = &self.0 {
112			metrics.code_size.observe(code_size as f64);
113		}
114	}
115
116	pub(crate) fn observe_pov_size(&self, pov_size: usize, compressed: bool) {
117		if let Some(metrics) = &self.0 {
118			metrics
119				.pov_size
120				.with_label_values(&[if compressed { "true" } else { "false" }])
121				.observe(pov_size as f64);
122		}
123	}
124
125	/// When preparation pipeline concluded working on an item.
126	pub(crate) fn on_execute_kind(&self, kind: PvfExecKind) {
127		if let Some(metrics) = &self.0 {
128			metrics.exec_kind_selected.with_label_values(&[kind.as_str()]).inc();
129		}
130	}
131}
132
133#[derive(Clone)]
134struct MetricsInner {
135	worker_spawning: prometheus::CounterVec<prometheus::U64>,
136	worker_spawned: prometheus::CounterVec<prometheus::U64>,
137	worker_retired: prometheus::CounterVec<prometheus::U64>,
138	prepare_enqueued: prometheus::Counter<prometheus::U64>,
139	prepare_concluded: prometheus::Counter<prometheus::U64>,
140	execute_enqueued: prometheus::Counter<prometheus::U64>,
141	execute_finished: prometheus::Counter<prometheus::U64>,
142	preparation_time: prometheus::Histogram,
143	execution_time: prometheus::Histogram,
144	execution_queued_time: prometheus::Histogram,
145	#[cfg(target_os = "linux")]
146	preparation_max_rss: prometheus::Histogram,
147	// Max. allocated memory, tracked by Jemallocator, polling-based
148	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
149	preparation_max_allocated: prometheus::Histogram,
150	// Max. resident memory, tracked by Jemallocator, polling-based
151	#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
152	preparation_max_resident: prometheus::Histogram,
153	// Peak allocation value, tracked by tracking-allocator
154	preparation_peak_tracked_allocation: prometheus::Histogram,
155	pov_size: prometheus::HistogramVec,
156	code_size: prometheus::Histogram,
157	exec_kind_selected: prometheus::CounterVec<prometheus::U64>,
158}
159
160impl metrics::Metrics for Metrics {
161	fn try_register(registry: &prometheus::Registry) -> Result<Self, prometheus::PrometheusError> {
162		let inner = MetricsInner {
163			worker_spawning: prometheus::register(
164				prometheus::CounterVec::new(
165					prometheus::Opts::new(
166						"polkadot_pvf_worker_spawning",
167						"The total number of workers began to spawn",
168					),
169					&["flavor"],
170				)?,
171				registry,
172			)?,
173			worker_spawned: prometheus::register(
174				prometheus::CounterVec::new(
175					prometheus::Opts::new(
176						"polkadot_pvf_worker_spawned",
177						"The total number of workers spawned successfully",
178					),
179					&["flavor"],
180				)?,
181				registry,
182			)?,
183			worker_retired: prometheus::register(
184				prometheus::CounterVec::new(
185					prometheus::Opts::new(
186						"polkadot_pvf_worker_retired",
187						"The total number of workers retired, either killed by the host or died on duty",
188					),
189					&["flavor"],
190				)?,
191				registry,
192			)?,
193			prepare_enqueued: prometheus::register(
194				prometheus::Counter::new(
195					"polkadot_pvf_prepare_enqueued",
196					"The total number of jobs enqueued into the preparation pipeline"
197				)?,
198				registry,
199			)?,
200			prepare_concluded: prometheus::register(
201				prometheus::Counter::new(
202					"polkadot_pvf_prepare_concluded",
203					"The total number of jobs concluded in the preparation pipeline"
204				)?,
205				registry,
206			)?,
207			execute_enqueued: prometheus::register(
208				prometheus::Counter::new(
209					"polkadot_pvf_execute_enqueued",
210					"The total number of jobs enqueued into the execution pipeline"
211				)?,
212				registry,
213			)?,
214			execute_finished: prometheus::register(
215				prometheus::Counter::new(
216					"polkadot_pvf_execute_finished",
217					"The total number of jobs done in the execution pipeline"
218				)?,
219				registry,
220			)?,
221			preparation_time: prometheus::register(
222				prometheus::Histogram::with_opts(
223					prometheus::HistogramOpts::new(
224						"polkadot_pvf_preparation_time",
225						"Time spent in preparing PVF artifacts in seconds",
226					)
227					.buckets(vec![
228						// This is synchronized with the `DEFAULT_PRECHECK_PREPARATION_TIMEOUT=60s`
229						// and `DEFAULT_LENIENT_PREPARATION_TIMEOUT=360s` constants found in
230						// node/core/candidate-validation/src/lib.rs
231						0.1,
232						0.5,
233						1.0,
234						2.0,
235						3.0,
236						10.0,
237						20.0,
238						30.0,
239						60.0,
240						120.0,
241						240.0,
242						360.0,
243						480.0,
244					]),
245				)?,
246				registry,
247			)?,
248			execution_time: prometheus::register(
249				prometheus::Histogram::with_opts(
250					prometheus::HistogramOpts::new(
251						"polkadot_pvf_execution_time",
252						"Time spent in executing PVFs",
253					).buckets(vec![
254						// This is synchronized with `DEFAULT_APPROVAL_EXECUTION_TIMEOUT` and
255						// `DEFAULT_BACKING_EXECUTION_TIMEOUT` constants in
256						// node/core/candidate-validation/src/lib.rs
257						0.01,
258						0.025,
259						0.05,
260						0.1,
261						0.25,
262						0.5,
263						1.0,
264						2.0,
265						3.0,
266						4.0,
267						5.0,
268						6.0,
269						8.0,
270						10.0,
271						12.0,
272					]),
273				)?,
274				registry,
275			)?,
276			execution_queued_time: prometheus::register(
277				prometheus::Histogram::with_opts(
278					prometheus::HistogramOpts::new(
279						"polkadot_pvf_execution_queued_time",
280						"Time spent in queue waiting for PVFs execution job to be assigned",
281					).buckets(vec![
282						0.01,
283						0.025,
284						0.05,
285						0.1,
286						0.25,
287						0.5,
288						1.0,
289						2.0,
290						3.0,
291						4.0,
292						5.0,
293						6.0,
294						12.0,
295						24.0,
296						48.0,
297					]),
298				)?,
299				registry,
300			)?,
301			#[cfg(target_os = "linux")]
302			preparation_max_rss: prometheus::register(
303				prometheus::Histogram::with_opts(
304					prometheus::HistogramOpts::new(
305						"polkadot_pvf_preparation_max_rss",
306						"ru_maxrss (maximum resident set size) observed for preparation (in kilobytes)",
307					).buckets(
308						prometheus::exponential_buckets(8192.0, 2.0, 10)
309							.expect("arguments are always valid; qed"),
310					),
311				)?,
312				registry,
313			)?,
314			#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
315			preparation_max_resident: prometheus::register(
316				prometheus::Histogram::with_opts(
317					prometheus::HistogramOpts::new(
318						"polkadot_pvf_preparation_max_resident",
319						"max resident memory observed for preparation (in kilobytes)",
320					).buckets(
321						prometheus::exponential_buckets(8192.0, 2.0, 10)
322							.expect("arguments are always valid; qed"),
323					),
324				)?,
325				registry,
326			)?,
327			#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
328			preparation_max_allocated: prometheus::register(
329				prometheus::Histogram::with_opts(
330					prometheus::HistogramOpts::new(
331						"polkadot_pvf_preparation_max_allocated",
332						"max allocated memory observed for preparation (in kilobytes)",
333					).buckets(
334						prometheus::exponential_buckets(8192.0, 2.0, 10)
335							.expect("arguments are always valid; qed"),
336					),
337				)?,
338				registry,
339			)?,
340			preparation_peak_tracked_allocation: prometheus::register(
341				prometheus::Histogram::with_opts(
342					prometheus::HistogramOpts::new(
343						"polkadot_pvf_preparation_peak_tracked_allocation",
344						"peak allocation observed for preparation (in kilobytes)",
345					).buckets(
346						prometheus::exponential_buckets(8192.0, 2.0, 10)
347							.expect("arguments are always valid; qed"),
348					),
349				)?,
350				registry,
351			)?,
352			// The following metrics was moved here from the candidate valiidation subsystem.
353			// Names are kept to avoid breaking dashboards and stuff.
354			pov_size: prometheus::register(
355				prometheus::HistogramVec::new(
356					prometheus::HistogramOpts::new(
357						"polkadot_parachain_candidate_validation_pov_size",
358						"The compressed and decompressed size of the proof of validity of a candidate",
359					)
360					.buckets(
361						prometheus::exponential_buckets(16384.0, 2.0, 10)
362							.expect("arguments are always valid; qed"),
363					),
364					&["compressed"],
365				)?,
366				registry,
367			)?,
368			code_size: prometheus::register(
369				prometheus::Histogram::with_opts(
370					prometheus::HistogramOpts::new(
371						"polkadot_parachain_candidate_validation_code_size",
372						"The size of the decompressed WASM validation blob used for checking a candidate",
373					)
374					.buckets(
375						prometheus::exponential_buckets(16384.0, 2.0, 10)
376							.expect("arguments are always valid; qed"),
377					),
378				)?,
379				registry,
380			)?,
381			exec_kind_selected: prometheus::register(
382				prometheus::CounterVec::new(
383					prometheus::Opts::new(
384						"polkadot_pvf_exec_kind_selected",
385						"The total number of selected execute kinds",
386					),
387					&["priority"],
388				)?,
389				registry,
390			)?,
391		};
392		Ok(Metrics(Some(inner)))
393	}
394}
395
/// The kind of worker a metric refers to.
///
/// Used as the value of the `flavor` label on the worker lifecycle counters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WorkerFlavor {
	/// A worker that prepares artifacts.
	Prepare,
	/// A worker that executes them.
	Execute,
}

impl WorkerFlavor {
	/// Returns the label value identifying this flavor: `"prepare"` or `"execute"`.
	fn as_label(&self) -> &'static str {
		// Exhaustive on purpose: adding a variant must force a decision about its label.
		match self {
			WorkerFlavor::Prepare => "prepare",
			WorkerFlavor::Execute => "execute",
		}
	}
}
409
410pub(crate) struct WorkerRelatedMetrics<'a> {
411	metrics: &'a Metrics,
412	flavor: WorkerFlavor,
413}
414
415impl<'a> WorkerRelatedMetrics<'a> {
416	/// When the spawning of a worker started.
417	pub(crate) fn on_begin_spawn(&self) {
418		if let Some(metrics) = &self.metrics.0 {
419			metrics.worker_spawning.with_label_values(&[self.flavor.as_label()]).inc();
420		}
421	}
422
423	/// When the worker successfully spawned.
424	pub(crate) fn on_spawned(&self) {
425		if let Some(metrics) = &self.metrics.0 {
426			metrics.worker_spawned.with_label_values(&[self.flavor.as_label()]).inc();
427		}
428	}
429
430	/// When the worker was killed or died.
431	pub(crate) fn on_retired(&self) {
432		if let Some(metrics) = &self.metrics.0 {
433			metrics.worker_retired.with_label_values(&[self.flavor.as_label()]).inc();
434		}
435	}
436}