polkadot_availability_recovery/
metrics.rs

1// Copyright (C) Parity Technologies (UK) Ltd.
2// This file is part of Polkadot.
3
4// Polkadot is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// Polkadot is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13
14// You should have received a copy of the GNU General Public License
15// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
16
17use polkadot_node_subsystem::prometheus::HistogramVec;
18use polkadot_node_subsystem_util::metrics::{
19	self,
20	prometheus::{
21		self, prometheus::HistogramTimer, Counter, CounterVec, Histogram, Opts, PrometheusError,
22		Registry, U64,
23	},
24};
25
26/// Availability Distribution metrics.
27#[derive(Clone, Default)]
28pub struct Metrics(Option<MetricsInner>);
29
30#[derive(Clone)]
31struct MetricsInner {
32	/// Number of sent chunk requests.
33	///
34	/// Gets incremented on each sent chunk requests.
35	///
36	/// Split by chunk type:
37	/// - `regular_chunks`
38	/// - `systematic_chunks`
39	chunk_requests_issued: CounterVec<U64>,
40
41	/// Total number of bytes recovered
42	///
43	/// Gets incremented on each successful recovery
44	recovered_bytes_total: Counter<U64>,
45
46	/// A counter for finished chunk requests.
47	///
48	/// Split by the chunk type (`regular_chunks` or `systematic_chunks`)
49	///
50	/// Also split by result:
51	/// - `no_such_chunk` ... peer did not have the requested chunk
52	/// - `timeout` ... request timed out.
53	/// - `error` ... Some networking issue except timeout
54	/// - `invalid` ... Chunk was received, but not valid.
55	/// - `success`
56	chunk_requests_finished: CounterVec<U64>,
57
58	/// A counter for successful chunk requests, split by the network protocol version.
59	chunk_request_protocols: CounterVec<U64>,
60
61	/// Number of sent available data requests.
62	full_data_requests_issued: Counter<U64>,
63
64	/// Counter for finished available data requests.
65	///
66	/// Split by the result type:
67	///
68	/// - `no_such_data` ... peer did not have the requested data
69	/// - `timeout` ... request timed out.
70	/// - `error` ... Some networking issue except timeout
71	/// - `invalid` ... data was received, but not valid.
72	/// - `success`
73	full_data_requests_finished: CounterVec<U64>,
74
75	/// The duration of request to response.
76	///
77	/// Split by chunk type (`regular_chunks` or `systematic_chunks`).
78	time_chunk_request: HistogramVec,
79
80	/// The duration between the pure recovery and verification.
81	///
82	/// Split by recovery type (`regular_chunks`, `systematic_chunks` or `full_from_backers`).
83	time_erasure_recovery: HistogramVec,
84
85	/// How much time it takes to reconstruct the available data from chunks.
86	///
87	/// Split by chunk type (`regular_chunks` or `systematic_chunks`), as the algorithms are
88	/// different.
89	time_erasure_reconstruct: HistogramVec,
90
91	/// How much time it takes to re-encode the data into erasure chunks in order to verify
92	/// the root hash of the provided Merkle tree. See `reconstructed_data_matches_root`.
93	time_reencode_chunks: Histogram,
94
95	/// Time of a full recovery, including erasure decoding or until we gave
96	/// up.
97	time_full_recovery: Histogram,
98
99	/// Number of full recoveries that have been finished one way or the other.
100	///
101	/// Split by recovery `strategy_type` (`full_from_backers, systematic_chunks, regular_chunks,
102	/// all`). `all` is used for failed recoveries that tried all available strategies.
103	/// Also split by `result` type.
104	full_recoveries_finished: CounterVec<U64>,
105
106	/// Number of full recoveries that have been started on this subsystem.
107	///
108	/// Note: Those are only recoveries which could not get served locally already - so in other
109	/// words: Only real recoveries.
110	full_recoveries_started: Counter<U64>,
111}
112
113impl Metrics {
114	/// Create new dummy metrics, not reporting anything.
115	pub fn new_dummy() -> Self {
116		Metrics(None)
117	}
118
119	/// Increment counter for chunk requests.
120	pub fn on_chunk_request_issued(&self, chunk_type: &str) {
121		if let Some(metrics) = &self.0 {
122			metrics.chunk_requests_issued.with_label_values(&[chunk_type]).inc()
123		}
124	}
125
126	/// Increment counter for full data requests.
127	pub fn on_full_request_issued(&self) {
128		if let Some(metrics) = &self.0 {
129			metrics.full_data_requests_issued.inc()
130		}
131	}
132
133	/// A chunk request timed out.
134	pub fn on_chunk_request_timeout(&self, chunk_type: &str) {
135		if let Some(metrics) = &self.0 {
136			metrics
137				.chunk_requests_finished
138				.with_label_values(&[chunk_type, "timeout"])
139				.inc()
140		}
141	}
142
143	/// A full data request timed out.
144	pub fn on_full_request_timeout(&self) {
145		if let Some(metrics) = &self.0 {
146			metrics.full_data_requests_finished.with_label_values(&["timeout"]).inc()
147		}
148	}
149
150	/// A chunk request failed because validator did not have its chunk.
151	pub fn on_chunk_request_no_such_chunk(&self, chunk_type: &str) {
152		if let Some(metrics) = &self.0 {
153			metrics
154				.chunk_requests_finished
155				.with_label_values(&[chunk_type, "no_such_chunk"])
156				.inc()
157		}
158	}
159
160	/// A full data request failed because the validator did not have it.
161	pub fn on_full_request_no_such_data(&self) {
162		if let Some(metrics) = &self.0 {
163			metrics.full_data_requests_finished.with_label_values(&["no_such_data"]).inc()
164		}
165	}
166
167	/// A chunk request failed for some non timeout related network error.
168	pub fn on_chunk_request_error(&self, chunk_type: &str) {
169		if let Some(metrics) = &self.0 {
170			metrics.chunk_requests_finished.with_label_values(&[chunk_type, "error"]).inc()
171		}
172	}
173
174	/// A full data request failed for some non timeout related network error.
175	pub fn on_full_request_error(&self) {
176		if let Some(metrics) = &self.0 {
177			metrics.full_data_requests_finished.with_label_values(&["error"]).inc()
178		}
179	}
180
181	/// A chunk request succeeded, but was not valid.
182	pub fn on_chunk_request_invalid(&self, chunk_type: &str) {
183		if let Some(metrics) = &self.0 {
184			metrics
185				.chunk_requests_finished
186				.with_label_values(&[chunk_type, "invalid"])
187				.inc()
188		}
189	}
190
191	/// A full data request succeeded, but was not valid.
192	pub fn on_full_request_invalid(&self) {
193		if let Some(metrics) = &self.0 {
194			metrics.full_data_requests_finished.with_label_values(&["invalid"]).inc()
195		}
196	}
197
198	/// A chunk request succeeded.
199	pub fn on_chunk_request_succeeded(&self, chunk_type: &str) {
200		if let Some(metrics) = &self.0 {
201			metrics
202				.chunk_requests_finished
203				.with_label_values(&[chunk_type, "success"])
204				.inc()
205		}
206	}
207
208	/// A chunk response was received on the v1 protocol.
209	pub fn on_chunk_response_v1(&self) {
210		if let Some(metrics) = &self.0 {
211			metrics.chunk_request_protocols.with_label_values(&["v1"]).inc()
212		}
213	}
214
215	/// A chunk response was received on the v2 protocol.
216	pub fn on_chunk_response_v2(&self) {
217		if let Some(metrics) = &self.0 {
218			metrics.chunk_request_protocols.with_label_values(&["v2"]).inc()
219		}
220	}
221
222	/// A full data request succeeded.
223	pub fn on_full_request_succeeded(&self) {
224		if let Some(metrics) = &self.0 {
225			metrics.full_data_requests_finished.with_label_values(&["success"]).inc()
226		}
227	}
228
229	/// Get a timer to time request/response duration.
230	pub fn time_chunk_request(&self, chunk_type: &str) -> Option<HistogramTimer> {
231		self.0.as_ref().map(|metrics| {
232			metrics.time_chunk_request.with_label_values(&[chunk_type]).start_timer()
233		})
234	}
235
236	/// Get a timer to time erasure code recover.
237	pub fn time_erasure_recovery(&self, chunk_type: &str) -> Option<HistogramTimer> {
238		self.0.as_ref().map(|metrics| {
239			metrics.time_erasure_recovery.with_label_values(&[chunk_type]).start_timer()
240		})
241	}
242
243	/// Get a timer for available data reconstruction.
244	pub fn time_erasure_reconstruct(&self, chunk_type: &str) -> Option<HistogramTimer> {
245		self.0.as_ref().map(|metrics| {
246			metrics.time_erasure_reconstruct.with_label_values(&[chunk_type]).start_timer()
247		})
248	}
249
250	/// Get a timer to time chunk encoding.
251	pub fn time_reencode_chunks(&self) -> Option<HistogramTimer> {
252		self.0.as_ref().map(|metrics| metrics.time_reencode_chunks.start_timer())
253	}
254
255	/// Get a timer to measure the time of the complete recovery process.
256	pub fn time_full_recovery(&self) -> Option<HistogramTimer> {
257		self.0.as_ref().map(|metrics| metrics.time_full_recovery.start_timer())
258	}
259
260	/// A full recovery succeeded.
261	pub fn on_recovery_succeeded(&self, strategy_type: &str, bytes: usize) {
262		if let Some(metrics) = &self.0 {
263			metrics
264				.full_recoveries_finished
265				.with_label_values(&["success", strategy_type])
266				.inc();
267			metrics.recovered_bytes_total.inc_by(bytes as u64)
268		}
269	}
270
271	/// A full recovery failed (data not available).
272	pub fn on_recovery_failed(&self, strategy_type: &str) {
273		if let Some(metrics) = &self.0 {
274			metrics
275				.full_recoveries_finished
276				.with_label_values(&["failure", strategy_type])
277				.inc()
278		}
279	}
280
281	/// A full recovery failed (data was recovered, but invalid).
282	pub fn on_recovery_invalid(&self, strategy_type: &str) {
283		if let Some(metrics) = &self.0 {
284			metrics
285				.full_recoveries_finished
286				.with_label_values(&["invalid", strategy_type])
287				.inc()
288		}
289	}
290
291	/// A recover was started.
292	pub fn on_recovery_started(&self) {
293		if let Some(metrics) = &self.0 {
294			metrics.full_recoveries_started.inc()
295		}
296	}
297}
298
299impl metrics::Metrics for Metrics {
300	fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
301		let metrics = MetricsInner {
302			chunk_requests_issued: prometheus::register(
303				CounterVec::new(
304					Opts::new("polkadot_parachain_availability_recovery_chunk_requests_issued",
305					"Total number of issued chunk requests."),
306					&["type"]
307				)?,
308				registry,
309			)?,
310			full_data_requests_issued: prometheus::register(
311				Counter::new(
312					"polkadot_parachain_availability_recovery_full_data_requests_issued",
313					"Total number of issued full data requests.",
314				)?,
315				registry,
316			)?,
317			recovered_bytes_total: prometheus::register(
318				Counter::new(
319					"polkadot_parachain_availability_recovery_bytes_total",
320					"Total number of bytes recovered",
321				)?,
322				registry,
323			)?,
324			chunk_requests_finished: prometheus::register(
325				CounterVec::new(
326					Opts::new(
327						"polkadot_parachain_availability_recovery_chunk_requests_finished",
328						"Total number of chunk requests finished.",
329					),
330					&["result", "type"],
331				)?,
332				registry,
333			)?,
334			chunk_request_protocols: prometheus::register(
335				CounterVec::new(
336					Opts::new(
337						"polkadot_parachain_availability_recovery_chunk_request_protocols",
338						"Total number of successful chunk requests, mapped by the protocol version (v1 or v2).",
339					),
340					&["protocol"],
341				)?,
342				registry,
343			)?,
344			full_data_requests_finished: prometheus::register(
345				CounterVec::new(
346					Opts::new(
347						"polkadot_parachain_availability_recovery_full_data_requests_finished",
348						"Total number of full data requests finished.",
349					),
350					&["result"],
351				)?,
352				registry,
353			)?,
354			time_chunk_request: prometheus::register(
355				prometheus::HistogramVec::new(prometheus::HistogramOpts::new(
356					"polkadot_parachain_availability_recovery_time_chunk_request",
357					"Time spent waiting for a response to a chunk request",
358				), &["type"])?,
359				registry,
360			)?,
361			time_erasure_recovery: prometheus::register(
362				prometheus::HistogramVec::new(prometheus::HistogramOpts::new(
363					"polkadot_parachain_availability_recovery_time_erasure_recovery",
364					"Time spent to recover the erasure code and verify the merkle root by re-encoding as erasure chunks",
365				), &["type"])?,
366				registry,
367			)?,
368			time_erasure_reconstruct: prometheus::register(
369				prometheus::HistogramVec::new(prometheus::HistogramOpts::new(
370					"polkadot_parachain_availability_recovery_time_erasure_reconstruct",
371					"Time spent to reconstruct the data from chunks",
372				), &["type"])?,
373				registry,
374			)?,
375			time_reencode_chunks: prometheus::register(
376				prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
377					"polkadot_parachain_availability_reencode_chunks",
378					"Time spent re-encoding the data as erasure chunks",
379				))?,
380				registry,
381			)?,
382			time_full_recovery: prometheus::register(
383				prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
384					"polkadot_parachain_availability_recovery_time_total",
385					"Time a full recovery process took, either until failure or successful erasure decoding.",
386				))?,
387				registry,
388			)?,
389			full_recoveries_finished: prometheus::register(
390				CounterVec::new(
391					Opts::new(
392						"polkadot_parachain_availability_recovery_recoveries_finished",
393						"Total number of recoveries that finished.",
394					),
395					&["result", "strategy_type"],
396				)?,
397				registry,
398			)?,
399			full_recoveries_started: prometheus::register(
400				Counter::new(
401					"polkadot_parachain_availability_recovery_recoveries_started",
402					"Total number of started recoveries.",
403				)?,
404				registry,
405			)?,
406		};
407		Ok(Metrics(Some(metrics)))
408	}
409}
polkadot_availability_recovery/metrics.rs

polkadot_availability_recovery/
metrics.rs