polkadot_node_network_protocol/request_response/mod.rs
1// Copyright (C) Parity Technologies (UK) Ltd.
2// This file is part of Polkadot.
3
4// Polkadot is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// Polkadot is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13
14// You should have received a copy of the GNU General Public License
15// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
16
//! Overview of the request/response protocols used in `Polkadot`.
18//!
19//! `enum Protocol` .... List of all supported protocols.
20//!
21//! `enum Requests` .... List of all supported requests, each entry matches one in protocols, but
22//! has the actual request as payload.
23//!
24//! `struct IncomingRequest` .... wrapper for incoming requests, containing a sender for sending
25//! responses.
26//!
27//! `struct OutgoingRequest` .... wrapper for outgoing requests, containing a sender used by the
28//! networking code for delivering responses/delivery errors.
29//!
30//! `trait IsRequest` .... A trait describing a particular request. It is used for gathering meta
31//! data, like what is the corresponding response type.
32//!
33//! ## Versioning
34//!
35//! Versioning for request-response protocols can be done in multiple ways.
36//!
37//! If you're just changing the protocol name but the binary payloads are the same, just add a new
38//! `fallback_name` to the protocol config.
39//!
40//! One way in which versioning has historically been achieved for req-response protocols is to
41//! bundle the new req-resp version with an upgrade of a notifications protocol. The subsystem would
42//! then know which request version to use based on stored data about the peer's notifications
43//! protocol version.
44//!
45//! When bumping a notifications protocol version is not needed/desirable, you may add a new
46//! req-resp protocol and set the old request as a fallback (see
47//! `OutgoingRequest::new_with_fallback`). A request with the new version will be attempted and if
48//! the protocol is refused by the peer, the fallback protocol request will be used.
49//! Information about the actually used protocol will be returned alongside the raw response, so
50//! that you know how to decode it.
51
52use std::{collections::HashMap, time::Duration, u64};
53
54use polkadot_primitives::MAX_CODE_SIZE;
55use sc_network::{NetworkBackend, MAX_RESPONSE_SIZE};
56use sp_runtime::traits::Block;
57use strum::{EnumIter, IntoEnumIterator};
58
59pub use sc_network::{config as network, config::RequestResponseConfig, ProtocolName};
60
61/// Everything related to handling of incoming requests.
62pub mod incoming;
63/// Everything related to handling of outgoing requests.
64pub mod outgoing;
65
66pub use incoming::{IncomingRequest, IncomingRequestReceiver};
67
68pub use outgoing::{OutgoingRequest, OutgoingResult, Recipient, Requests, ResponseSender};
69
70///// Multiplexer for incoming requests.
71// pub mod multiplexer;
72
/// Version 1 of the requests and responses that are sent over the wire.
74pub mod v1;
75
/// Version 2 of the requests and responses that are sent over the wire.
77pub mod v2;
78
79/// A protocol per subsystem seems to make the most sense, this way we don't need any dispatching
80/// within protocols.
81#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, EnumIter)]
82pub enum Protocol {
83 /// Protocol for chunk fetching, used by availability distribution and availability recovery.
84 ChunkFetchingV1,
85 /// Protocol for fetching collations from collators.
86 CollationFetchingV1,
87 /// Protocol for fetching collations from collators when async backing is enabled.
88 CollationFetchingV2,
89 /// Protocol for fetching seconded PoVs from validators of the same group.
90 PoVFetchingV1,
91 /// Protocol for fetching available data.
92 AvailableDataFetchingV1,
93 /// Fetching of statements that are too large for gossip.
94 StatementFetchingV1,
95 /// Sending of dispute statements with application level confirmations.
96 DisputeSendingV1,
97
98 /// Protocol for requesting candidates with attestations in statement distribution
99 /// when async backing is enabled.
100 AttestedCandidateV2,
101
102 /// Protocol for chunk fetching version 2, used by availability distribution and availability
103 /// recovery.
104 ChunkFetchingV2,
105}
106
/// Minimum bandwidth we expect validators to have, in bytes per second.
///
/// The recommendation is 500Mbit/s, which we approximate as 50MB per second.
const MIN_BANDWIDTH_BYTES: u64 = 50 * 1024 * 1024;

/// Default request timeout.
///
/// Be careful when lowering this value: the very first request might have to open a connection
/// first, which can be slow. Should that turn out to be a problem, connectivity should be ensured
/// via peer sets instead.
// Not referenced by the code in this module, hence the lint allowance.
#[allow(dead_code)]
const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3);

/// Request timeout for the case where the connection can be assumed to be open already (e.g.
/// because the peer is part of a peer set as well).
const DEFAULT_REQUEST_TIMEOUT_CONNECTED: Duration = Duration::from_secs(1);

/// Timeout for requesting availability chunks.
pub const CHUNK_REQUEST_TIMEOUT: Duration = DEFAULT_REQUEST_TIMEOUT_CONNECTED;

/// Timeout for PoV-like requests on an already open connection.
///
/// The value is what seems sensible from a time budget perspective, given a 6 second block time.
/// It will be tough with multiple forks and large PoVs, but we only have so much time.
const POV_REQUEST_TIMEOUT_CONNECTED: Duration = Duration::from_millis(1_200);

/// Timeout for statement requests.
///
/// Statement requests should time out fast, so that no time is wasted on slow nodes. Responders
/// try their best to either serve within this timeout or return an error immediately. (Statement
/// distribution has to fit within a block of 6 seconds.)
const STATEMENTS_TIMEOUT: Duration = Duration::from_secs(1);

/// Timeout for attested candidate requests.
///
/// These requests should time out relatively fast, because slow requests bottleneck the backing
/// system. Ideally the timeout would adapt to the candidate size, as candidate sizes vary a lot:
/// candidates with no code and no messages vs. candidates with code and messages.
///
/// Some leniency is granted because large candidates are common and asynchronous backing allows
/// them to be included over a longer window of time. Exponential back-off up to a maximum of
/// 10 seconds would be ideal, but is not yet supported by the infrastructure here, see
/// <https://github.com/paritytech/polkadot/issues/6009>.
const ATTESTED_CANDIDATE_TIMEOUT: Duration = Duration::from_millis(2_500);

/// Maximum number of statement requests handled in parallel.
///
/// A slow peer should not slow down all the others, yet the data should reach at least some
/// peers quickly and in full (they can then start serving it, which reduces the load on us).
/// The value is therefore a trade-off and 3 seems sensible: it takes 3 slow connected nodes to
/// delay the transfer for others by `STATEMENTS_TIMEOUT`.
pub const MAX_PARALLEL_STATEMENT_REQUESTS: u32 = 3;

/// Maximum number of attested candidate requests handled in parallel.
///
/// A slow peer should not slow down all the others, yet the data should reach at least some
/// peers quickly and in full (they can then start serving it, which reduces the load on us).
/// The value is therefore a trade-off and 5 seems sensible: it takes 5 slow connected nodes to
/// delay the transfer for others by `ATTESTED_CANDIDATE_TIMEOUT`.
pub const MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS: u32 = 5;
159
160/// Response size limit for responses of POV like data.
161///
162/// Same as what we use in substrate networking.
163const POV_RESPONSE_SIZE: u64 = MAX_RESPONSE_SIZE;
164
165/// Maximum response sizes for `StatementFetchingV1`.
166///
167/// This is `MAX_CODE_SIZE` plus some additional space for protocol overhead.
168const STATEMENT_RESPONSE_SIZE: u64 = MAX_CODE_SIZE as u64 + 10_000;
169
170/// Maximum response sizes for `AttestedCandidateV2`.
171///
172/// This is `MAX_CODE_SIZE` plus some additional space for protocol overhead and
173/// additional backing statements.
174const ATTESTED_CANDIDATE_RESPONSE_SIZE: u64 = MAX_CODE_SIZE as u64 + 100_000;
175
/// Timeout for dispute requests.
///
/// A relatively large timeout is fine here: there is no value in hitting a timeout, as we want
/// to get statements through to each node in any case.
pub const DISPUTE_REQUEST_TIMEOUT: Duration = Duration::from_secs(12);
179
180impl Protocol {
181 /// Get a configuration for a given Request response protocol.
182 ///
183 /// Returns a `ProtocolConfig` for this protocol.
184 /// Use this if you plan only to send requests for this protocol.
185 pub fn get_outbound_only_config<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
186 self,
187 req_protocol_names: &ReqProtocolNames,
188 ) -> N::RequestResponseProtocolConfig {
189 self.create_config::<B, N>(req_protocol_names, None)
190 }
191
192 /// Get a configuration for a given Request response protocol.
193 ///
194 /// Returns a receiver for messages received on this protocol and the requested
195 /// `ProtocolConfig`.
196 pub fn get_config<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
197 self,
198 req_protocol_names: &ReqProtocolNames,
199 ) -> (async_channel::Receiver<network::IncomingRequest>, N::RequestResponseProtocolConfig) {
200 let (tx, rx) = async_channel::bounded(self.get_channel_size());
201 let cfg = self.create_config::<B, N>(req_protocol_names, Some(tx));
202 (rx, cfg)
203 }
204
205 fn create_config<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
206 self,
207 req_protocol_names: &ReqProtocolNames,
208 tx: Option<async_channel::Sender<network::IncomingRequest>>,
209 ) -> N::RequestResponseProtocolConfig {
210 let name = req_protocol_names.get_name(self);
211 let legacy_names = self.get_legacy_name().into_iter().map(Into::into).collect();
212 match self {
213 Protocol::ChunkFetchingV1 | Protocol::ChunkFetchingV2 => N::request_response_config(
214 name,
215 legacy_names,
216 1_000,
217 POV_RESPONSE_SIZE,
218 // We are connected to all validators:
219 CHUNK_REQUEST_TIMEOUT,
220 tx,
221 ),
222 Protocol::CollationFetchingV1 | Protocol::CollationFetchingV2 =>
223 N::request_response_config(
224 name,
225 legacy_names,
226 1_000,
227 POV_RESPONSE_SIZE,
228 // Taken from initial implementation in collator protocol:
229 POV_REQUEST_TIMEOUT_CONNECTED,
230 tx,
231 ),
232 Protocol::PoVFetchingV1 => N::request_response_config(
233 name,
234 legacy_names,
235 1_000,
236 POV_RESPONSE_SIZE,
237 POV_REQUEST_TIMEOUT_CONNECTED,
238 tx,
239 ),
240 Protocol::AvailableDataFetchingV1 => N::request_response_config(
241 name,
242 legacy_names,
243 1_000,
244 // Available data size is dominated by the PoV size.
245 POV_RESPONSE_SIZE,
246 POV_REQUEST_TIMEOUT_CONNECTED,
247 tx,
248 ),
249 Protocol::StatementFetchingV1 => N::request_response_config(
250 name,
251 legacy_names,
252 1_000,
253 // Available data size is dominated code size.
254 STATEMENT_RESPONSE_SIZE,
255 // We need statement fetching to be fast and will try our best at the responding
256 // side to answer requests within that timeout, assuming a bandwidth of 500Mbit/s
257 // - which is the recommended minimum bandwidth for nodes on Kusama as of April
258 // 2021.
259 // Responders will reject requests, if it is unlikely they can serve them within
260 // the timeout, so the requester can immediately try another node, instead of
261 // waiting for timeout on an overloaded node. Fetches from slow nodes will likely
262 // fail, but this is desired, so we can quickly move on to a faster one - we should
263 // also decrease its reputation.
264 Duration::from_secs(1),
265 tx,
266 ),
267 Protocol::DisputeSendingV1 => N::request_response_config(
268 name,
269 legacy_names,
270 1_000,
271 // Responses are just confirmation, in essence not even a bit. So 100 seems
272 // plenty.
273 100,
274 DISPUTE_REQUEST_TIMEOUT,
275 tx,
276 ),
277 Protocol::AttestedCandidateV2 => N::request_response_config(
278 name,
279 legacy_names,
280 1_000,
281 ATTESTED_CANDIDATE_RESPONSE_SIZE,
282 ATTESTED_CANDIDATE_TIMEOUT,
283 tx,
284 ),
285 }
286 }
287
288 // Channel sizes for the supported protocols.
289 fn get_channel_size(self) -> usize {
290 match self {
291 // Hundreds of validators will start requesting their chunks once they see a candidate
292 // awaiting availability on chain. Given that they will see that block at different
293 // times (due to network delays), 100 seems big enough to accommodate for "bursts",
294 // assuming we can service requests relatively quickly, which would need to be measured
295 // as well.
296 Protocol::ChunkFetchingV1 | Protocol::ChunkFetchingV2 => 100,
297 // 10 seems reasonable, considering group sizes of max 10 validators.
298 Protocol::CollationFetchingV1 | Protocol::CollationFetchingV2 => 10,
299 // 10 seems reasonable, considering group sizes of max 10 validators.
300 Protocol::PoVFetchingV1 => 10,
301 // Validators are constantly self-selecting to request available data which may lead
302 // to constant load and occasional burstiness.
303 Protocol::AvailableDataFetchingV1 => 100,
304 // Our queue size approximation is how many blocks of the size of
305 // a runtime we can transfer within a statements timeout, minus the requests we handle
306 // in parallel.
307 Protocol::StatementFetchingV1 => {
308 // We assume we can utilize up to 70% of the available bandwidth for statements.
309 // This is just a guess/estimate, with the following considerations: If we are
310 // faster than that, queue size will stay low anyway, even if not - requesters will
311 // get an immediate error, but if we are slower, requesters will run in a timeout -
312 // wasting precious time.
313 let available_bandwidth = 7 * MIN_BANDWIDTH_BYTES / 10;
314 let size = u64::saturating_sub(
315 STATEMENTS_TIMEOUT.as_millis() as u64 * available_bandwidth /
316 (1000 * MAX_CODE_SIZE as u64),
317 MAX_PARALLEL_STATEMENT_REQUESTS as u64,
318 );
319 debug_assert!(
320 size > 0,
321 "We should have a channel size greater zero, otherwise we won't accept any requests."
322 );
323 size as usize
324 },
325 // Incoming requests can get bursty, we should also be able to handle them fast on
326 // average, so something in the ballpark of 100 should be fine. Nodes will retry on
327 // failure, so having a good value here is mostly about performance tuning.
328 Protocol::DisputeSendingV1 => 100,
329
330 Protocol::AttestedCandidateV2 => {
331 // We assume we can utilize up to 70% of the available bandwidth for statements.
332 // This is just a guess/estimate, with the following considerations: If we are
333 // faster than that, queue size will stay low anyway, even if not - requesters will
334 // get an immediate error, but if we are slower, requesters will run in a timeout -
335 // wasting precious time.
336 let available_bandwidth = 7 * MIN_BANDWIDTH_BYTES / 10;
337 let size = u64::saturating_sub(
338 ATTESTED_CANDIDATE_TIMEOUT.as_millis() as u64 * available_bandwidth /
339 (1000 * MAX_CODE_SIZE as u64),
340 MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as u64,
341 );
342 debug_assert!(
343 size > 0,
344 "We should have a channel size greater zero, otherwise we won't accept any requests."
345 );
346 size as usize
347 },
348 }
349 }
350
351 /// Legacy protocol name associated with each peer set, if any.
352 /// The request will be tried on this legacy protocol name if the remote refuses to speak the
353 /// protocol.
354 const fn get_legacy_name(self) -> Option<&'static str> {
355 match self {
356 Protocol::ChunkFetchingV1 => Some("/polkadot/req_chunk/1"),
357 Protocol::CollationFetchingV1 => Some("/polkadot/req_collation/1"),
358 Protocol::PoVFetchingV1 => Some("/polkadot/req_pov/1"),
359 Protocol::AvailableDataFetchingV1 => Some("/polkadot/req_available_data/1"),
360 Protocol::StatementFetchingV1 => Some("/polkadot/req_statement/1"),
361 Protocol::DisputeSendingV1 => Some("/polkadot/send_dispute/1"),
362
363 // Introduced after legacy names became legacy.
364 Protocol::AttestedCandidateV2 => None,
365 Protocol::CollationFetchingV2 => None,
366 Protocol::ChunkFetchingV2 => None,
367 }
368 }
369}
370
371/// Common properties of any `Request`.
372pub trait IsRequest {
373 /// Each request has a corresponding `Response`.
374 type Response;
375
376 /// What protocol this `Request` implements.
377 const PROTOCOL: Protocol;
378}
379
380/// Type for getting on the wire [`Protocol`] names using genesis hash & fork id.
381#[derive(Clone)]
382pub struct ReqProtocolNames {
383 names: HashMap<Protocol, ProtocolName>,
384}
385
386impl ReqProtocolNames {
387 /// Construct [`ReqProtocolNames`] from `genesis_hash` and `fork_id`.
388 pub fn new<Hash: AsRef<[u8]>>(genesis_hash: Hash, fork_id: Option<&str>) -> Self {
389 let mut names = HashMap::new();
390 for protocol in Protocol::iter() {
391 names.insert(protocol, Self::generate_name(protocol, &genesis_hash, fork_id));
392 }
393 Self { names }
394 }
395
396 /// Get on the wire [`Protocol`] name.
397 pub fn get_name(&self, protocol: Protocol) -> ProtocolName {
398 self.names
399 .get(&protocol)
400 .expect("All `Protocol` enum variants are added above via `strum`; qed")
401 .clone()
402 }
403
404 /// Protocol name of this protocol based on `genesis_hash` and `fork_id`.
405 fn generate_name<Hash: AsRef<[u8]>>(
406 protocol: Protocol,
407 genesis_hash: &Hash,
408 fork_id: Option<&str>,
409 ) -> ProtocolName {
410 let prefix = if let Some(fork_id) = fork_id {
411 format!("/{}/{}", hex::encode(genesis_hash), fork_id)
412 } else {
413 format!("/{}", hex::encode(genesis_hash))
414 };
415
416 let short_name = match protocol {
417 // V1:
418 Protocol::ChunkFetchingV1 => "/req_chunk/1",
419 Protocol::CollationFetchingV1 => "/req_collation/1",
420 Protocol::PoVFetchingV1 => "/req_pov/1",
421 Protocol::AvailableDataFetchingV1 => "/req_available_data/1",
422 Protocol::StatementFetchingV1 => "/req_statement/1",
423 Protocol::DisputeSendingV1 => "/send_dispute/1",
424
425 // V2:
426 Protocol::CollationFetchingV2 => "/req_collation/2",
427 Protocol::AttestedCandidateV2 => "/req_attested_candidate/2",
428 Protocol::ChunkFetchingV2 => "/req_chunk/2",
429 };
430
431 format!("{}{}", prefix, short_name).into()
432 }
433}