time/format_description/parse/
lexer.rs

1//! Lexer for parsing format descriptions.
2
3use core::iter;
4
5use super::{unused, Error, Location, Spanned, SpannedValue};
6
7/// An iterator over the lexed tokens.
8pub(super) struct Lexed<I: Iterator> {
9    /// The internal iterator.
10    iter: iter::Peekable<I>,
11}
12
13impl<I: Iterator> Iterator for Lexed<I> {
14    type Item = I::Item;
15
16    fn next(&mut self) -> Option<Self::Item> {
17        self.iter.next()
18    }
19}
20
21impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> {
22    /// Peek at the next item in the iterator.
23    pub(super) fn peek(&mut self) -> Option<&I::Item> {
24        self.iter.peek()
25    }
26
27    /// Consume the next token if it is whitespace.
28    pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
29        if let Some(&Ok(Token::ComponentPart {
30            kind: ComponentKind::Whitespace,
31            value,
32        })) = self.peek()
33        {
34            self.next(); // consume
35            Some(value)
36        } else {
37            None
38        }
39    }
40
41    /// Consume the next token if it is a component item that is not whitespace.
42    pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
43        if let Some(&Ok(Token::ComponentPart {
44            kind: ComponentKind::NotWhitespace,
45            value,
46        })) = self.peek()
47        {
48            self.next(); // consume
49            Some(value)
50        } else {
51            None
52        }
53    }
54
55    /// Consume the next token if it is an opening bracket.
56    pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
57        if let Some(&Ok(Token::Bracket {
58            kind: BracketKind::Opening,
59            location,
60        })) = self.peek()
61        {
62            self.next(); // consume
63            Some(location)
64        } else {
65            None
66        }
67    }
68
69    /// Peek at the next token if it is a closing bracket.
70    pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
71        if let Some(Ok(Token::Bracket {
72            kind: BracketKind::Closing,
73            location,
74        })) = self.peek()
75        {
76            Some(location)
77        } else {
78            None
79        }
80    }
81
82    /// Consume the next token if it is a closing bracket.
83    pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
84        if let Some(&Ok(Token::Bracket {
85            kind: BracketKind::Closing,
86            location,
87        })) = self.peek()
88        {
89            self.next(); // consume
90            Some(location)
91        } else {
92            None
93        }
94    }
95}
96
/// A token emitted by the lexer. There is no semantic meaning at this stage.
///
/// Byte slices held by a token borrow from the input that was lexed (lifetime `'a`).
pub(super) enum Token<'a> {
    /// A literal string, formatted and parsed as-is.
    ///
    /// [`lex`] only emits this variant while no bracket is open.
    Literal(Spanned<&'a [u8]>),
    /// An opening or closing bracket. May or may not be the start or end of a component.
    ///
    /// When lexing version 1, an escaped `[[` is emitted as two consecutive opening brackets,
    /// each carrying its own location.
    Bracket {
        /// Whether the bracket is opening or closing.
        kind: BracketKind,
        /// Where the bracket was in the format string.
        location: Location,
    },
    /// One part of a component. This could be its name, a modifier, or whitespace.
    ///
    /// [`lex`] only emits this variant while at least one bracket is open. A part is a run of
    /// bytes that are either all ASCII whitespace or all not ASCII whitespace.
    ComponentPart {
        /// Whether the part is whitespace or not.
        kind: ComponentKind,
        /// The part itself.
        value: Spanned<&'a [u8]>,
    },
}
116
/// What type of bracket is present.
///
/// Carried by [`Token::Bracket`] to tell the two bracket bytes apart.
pub(super) enum BracketKind {
    /// An opening bracket: `[`
    Opening,
    /// A closing bracket: `]`
    Closing,
}
124
125/// Indicates whether the component is whitespace or not.
126pub(super) enum ComponentKind {
127    #[allow(clippy::missing_docs_in_private_items)]
128    Whitespace,
129    #[allow(clippy::missing_docs_in_private_items)]
130    NotWhitespace,
131}
132
133/// Attach [`Location`] information to each byte in the iterator.
134fn attach_location<'item>(
135    iter: impl Iterator<Item = &'item u8>,
136) -> impl Iterator<Item = (&'item u8, Location)> {
137    let mut byte_pos = 0;
138
139    iter.map(move |byte| {
140        let location = Location { byte: byte_pos };
141        byte_pos += 1;
142        (byte, location)
143    })
144}
145
146/// Parse the string into a series of [`Token`]s.
147///
148/// `VERSION` controls the version of the format description that is being parsed. Currently, this
149/// must be 1 or 2.
150///
151/// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`.
152/// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may
153///   currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
154///   other characters result in a lex error.
155pub(super) fn lex<const VERSION: usize>(
156    mut input: &[u8],
157) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
158    validate_version!(VERSION);
159
160    let mut depth: u8 = 0;
161    let mut iter = attach_location(input.iter()).peekable();
162    let mut second_bracket_location = None;
163
164    let iter = iter::from_fn(move || {
165        // The flag is only set when version is zero.
166        if version!(..=1) {
167            // There is a flag set to emit the second half of an escaped bracket pair.
168            if let Some(location) = second_bracket_location.take() {
169                return Some(Ok(Token::Bracket {
170                    kind: BracketKind::Opening,
171                    location,
172                }));
173            }
174        }
175
176        Some(Ok(match iter.next()? {
177            // possible escape sequence
178            (b'\\', backslash_loc) if version!(2..) => {
179                match iter.next() {
180                    Some((b'\\' | b'[' | b']', char_loc)) => {
181                        // The escaped character is emitted as-is.
182                        let char = &input[1..2];
183                        input = &input[2..];
184                        if depth == 0 {
185                            Token::Literal(char.spanned(backslash_loc.to(char_loc)))
186                        } else {
187                            Token::ComponentPart {
188                                kind: ComponentKind::NotWhitespace,
189                                value: char.spanned(backslash_loc.to(char_loc)),
190                            }
191                        }
192                    }
193                    Some((_, loc)) => {
194                        return Some(Err(Error {
195                            _inner: unused(loc.error("invalid escape sequence")),
196                            public: crate::error::InvalidFormatDescription::Expected {
197                                what: "valid escape sequence",
198                                index: loc.byte as _,
199                            },
200                        }));
201                    }
202                    None => {
203                        return Some(Err(Error {
204                            _inner: unused(backslash_loc.error("unexpected end of input")),
205                            public: crate::error::InvalidFormatDescription::Expected {
206                                what: "valid escape sequence",
207                                index: backslash_loc.byte as _,
208                            },
209                        }));
210                    }
211                }
212            }
213            // potentially escaped opening bracket
214            (b'[', location) if version!(..=1) => {
215                if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[') {
216                    // Escaped bracket. Store the location of the second so we can emit it later.
217                    second_bracket_location = Some(second_location);
218                    input = &input[2..];
219                } else {
220                    // opening bracket
221                    depth += 1;
222                    input = &input[1..];
223                }
224
225                Token::Bracket {
226                    kind: BracketKind::Opening,
227                    location,
228                }
229            }
230            // opening bracket
231            (b'[', location) => {
232                depth += 1;
233                input = &input[1..];
234
235                Token::Bracket {
236                    kind: BracketKind::Opening,
237                    location,
238                }
239            }
240            // closing bracket
241            (b']', location) if depth > 0 => {
242                depth -= 1;
243                input = &input[1..];
244
245                Token::Bracket {
246                    kind: BracketKind::Closing,
247                    location,
248                }
249            }
250            // literal
251            (_, start_location) if depth == 0 => {
252                let mut bytes = 1;
253                let mut end_location = start_location;
254
255                while let Some((_, location)) =
256                    iter.next_if(|&(&byte, _)| !((version!(2..) && byte == b'\\') || byte == b'['))
257                {
258                    end_location = location;
259                    bytes += 1;
260                }
261
262                let value = &input[..bytes];
263                input = &input[bytes..];
264
265                Token::Literal(value.spanned(start_location.to(end_location)))
266            }
267            // component part
268            (byte, start_location) => {
269                let mut bytes = 1;
270                let mut end_location = start_location;
271                let is_whitespace = byte.is_ascii_whitespace();
272
273                while let Some((_, location)) = iter.next_if(|&(byte, _)| {
274                    !matches!(byte, b'\\' | b'[' | b']')
275                        && is_whitespace == byte.is_ascii_whitespace()
276                }) {
277                    end_location = location;
278                    bytes += 1;
279                }
280
281                let value = &input[..bytes];
282                input = &input[bytes..];
283
284                Token::ComponentPart {
285                    kind: if is_whitespace {
286                        ComponentKind::Whitespace
287                    } else {
288                        ComponentKind::NotWhitespace
289                    },
290                    value: value.spanned(start_location.to(end_location)),
291                }
292            }
293        }))
294    });
295
296    Lexed {
297        iter: iter.peekable(),
298    }
299}