url/
parser.rs

1// Copyright 2013-2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use std::error::Error;
10use std::fmt::{self, Formatter, Write};
11use std::str;
12
13use crate::host::{Host, HostInternal};
14use crate::Url;
15use form_urlencoded::EncodingOverride;
16use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17
/// Fragment percent-encode set: `CONTROLS` plus space, `"`, `<`, `>` and `` ` ``.
///
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Path percent-encode set: everything in `FRAGMENT` plus `#`, `?`, `{` and `}`.
///
/// https://url.spec.whatwg.org/#path-percent-encode-set
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Userinfo percent-encode set: everything in `PATH` plus the delimiters
/// listed below.
///
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// Set for a single path segment: everything in `PATH` plus `/` and `%`.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

// The backslash (\) character is treated as a path separator in special URLs
// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

// Query sets: `QUERY` is `CONTROLS` plus space, `"`, `#`, `<` and `>`;
// `SPECIAL_QUERY` additionally contains the single quote (`'`).
// https://url.spec.whatwg.org/#query-state
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');
46
/// Shorthand for a `Result` whose error type is [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48
// Generates the field-less `ParseError` enum from a list of
// `Variant => "description",` pairs, together with a `Display` impl that
// writes the description belonging to the variant.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// This may be extended in the future, so the enum is marked
        /// `#[non_exhaustive]` and matches on it need a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            /// Writes the human-readable description of the variant.
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                match *self {
                    $(
                        ParseError::$name => fmt.write_str($description),
                    )+
                }
            }
        }
    }
}
74
// `ParseError` has no underlying source error, so the default `Error`
// methods are sufficient.
impl Error for ParseError {}

// Fatal parse errors: each line is `Variant => "message shown by Display"`.
simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    Overflow => "URLs more than 4 GB are not supported",
}
89
90impl From<::idna::Errors> for ParseError {
91    fn from(_: ::idna::Errors) -> ParseError {
92        ParseError::IdnaError
93    }
94}
95
// Generates the field-less `SyntaxViolation` enum from a list of
// `Variant => "description",` pairs. The description literal is used twice:
// as the variant's rustdoc text and as the value returned by `description()`.
macro_rules! syntax_violation_enum {
    ($($name: ident => $description: literal,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// forbidden.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                /// ```text
                #[doc = $description]
                /// ```
                $name,
            )+
        }

        impl SyntaxViolation {
            /// Returns the static human-readable description of this violation.
            pub fn description(&self) -> &'static str {
                match *self {
                    $(
                        SyntaxViolation::$name => $description,
                    )+
                }
            }
        }
    }
}
124
// Non-fatal violations: each entry is `Variant => "description"`; the
// description is what `SyntaxViolation::description()` returns.
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
141
142impl fmt::Display for SyntaxViolation {
143    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
144        fmt::Display::fmt(self.description(), f)
145    }
146}
147
/// Coarse classification of a URL scheme, as far as the parser cares.
///
/// `Debug` is derived so values can be used in `assert_eq!` and `{:?}`
/// formatting.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// One of `http`, `https`, `ws`, `wss` or `ftp`.
    SpecialNotFile,
    /// Any other scheme.
    NotSpecial,
}

impl SchemeType {
    /// Returns `true` for every variant except `NotSpecial`.
    pub fn is_special(&self) -> bool {
        !matches!(*self, SchemeType::NotSpecial)
    }

    /// Returns `true` only for `File`.
    pub fn is_file(&self) -> bool {
        matches!(*self, SchemeType::File)
    }
}

/// Classifies a scheme name.
///
/// The comparison is an exact, case-sensitive string match, so the scheme is
/// expected to be lowercase already; anything unrecognized is `NotSpecial`.
impl<T: AsRef<str>> From<T> for SchemeType {
    fn from(s: T) -> Self {
        match s.as_ref() {
            "http" | "https" | "ws" | "wss" | "ftp" => SchemeType::SpecialNotFile,
            "file" => SchemeType::File,
            _ => SchemeType::NotSpecial,
        }
    }
}
174
/// Returns the default port of `scheme`, or `None` when the scheme has none.
///
/// The lookup is an exact, case-sensitive match against `http`, `ws`,
/// `https`, `wss` and `ftp`.
pub fn default_port(scheme: &str) -> Option<u16> {
    const DEFAULT_PORTS: [(&str, u16); 5] = [
        ("http", 80),
        ("ws", 80),
        ("https", 443),
        ("wss", 443),
        ("ftp", 21),
    ];
    DEFAULT_PORTS
        .iter()
        .find(|&&(name, _)| name == scheme)
        .map(|&(_, port)| port)
}
183
/// Cursor over the not-yet-consumed part of the parser's input.
///
/// Cloning is how callers look ahead: methods such as `split_first` and
/// `split_prefix` advance a clone and leave the original untouched.
#[derive(Clone, Debug)]
pub struct Input<'i> {
    /// Character iterator over the remaining input text.
    chars: str::Chars<'i>,
}
188
189impl<'i> Input<'i> {
190    pub fn new_no_trim(input: &'i str) -> Self {
191        Input {
192            chars: input.chars(),
193        }
194    }
195
196    pub fn new_trim_tab_and_newlines(
197        original_input: &'i str,
198        vfn: Option<&dyn Fn(SyntaxViolation)>,
199    ) -> Self {
200        let input = original_input.trim_matches(ascii_tab_or_new_line);
201        if let Some(vfn) = vfn {
202            if input.len() < original_input.len() {
203                vfn(SyntaxViolation::C0SpaceIgnored)
204            }
205            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
206                vfn(SyntaxViolation::TabOrNewlineIgnored)
207            }
208        }
209        Input {
210            chars: input.chars(),
211        }
212    }
213
214    pub fn new_trim_c0_control_and_space(
215        original_input: &'i str,
216        vfn: Option<&dyn Fn(SyntaxViolation)>,
217    ) -> Self {
218        let input = original_input.trim_matches(c0_control_or_space);
219        if let Some(vfn) = vfn {
220            if input.len() < original_input.len() {
221                vfn(SyntaxViolation::C0SpaceIgnored)
222            }
223            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
224                vfn(SyntaxViolation::TabOrNewlineIgnored)
225            }
226        }
227        Input {
228            chars: input.chars(),
229        }
230    }
231
232    #[inline]
233    pub fn is_empty(&self) -> bool {
234        self.clone().next().is_none()
235    }
236
237    #[inline]
238    fn starts_with<P: Pattern>(&self, p: P) -> bool {
239        p.split_prefix(&mut self.clone())
240    }
241
242    #[inline]
243    pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
244        let mut remaining = self.clone();
245        if p.split_prefix(&mut remaining) {
246            Some(remaining)
247        } else {
248            None
249        }
250    }
251
252    #[inline]
253    fn split_first(&self) -> (Option<char>, Self) {
254        let mut remaining = self.clone();
255        (remaining.next(), remaining)
256    }
257
258    #[inline]
259    fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
260        let mut count = 0;
261        let mut remaining = self.clone();
262        loop {
263            let mut input = remaining.clone();
264            if matches!(input.next(), Some(c) if f(c)) {
265                remaining = input;
266                count += 1;
267            } else {
268                return (count, remaining);
269            }
270        }
271    }
272
273    #[inline]
274    fn next_utf8(&mut self) -> Option<(char, &'i str)> {
275        loop {
276            let utf8 = self.chars.as_str();
277            match self.chars.next() {
278                Some(c) => {
279                    if !matches!(c, '\t' | '\n' | '\r') {
280                        return Some((c, &utf8[..c.len_utf8()]));
281                    }
282                }
283                None => return None,
284            }
285        }
286    }
287}
288
/// Something that can be matched against the front of an [`Input`].
pub trait Pattern {
    /// Tries to consume this pattern from the front of `input`, returning
    /// whether it matched.
    ///
    /// `input` is advanced as a side effect, so on a `false` result it may be
    /// left partially consumed; callers that need to keep their position
    /// pass a clone.
    fn split_prefix(self, input: &mut Input) -> bool;
}
292
293impl Pattern for char {
294    fn split_prefix(self, input: &mut Input) -> bool {
295        input.next() == Some(self)
296    }
297}
298
299impl<'a> Pattern for &'a str {
300    fn split_prefix(self, input: &mut Input) -> bool {
301        for c in self.chars() {
302            if input.next() != Some(c) {
303                return false;
304            }
305        }
306        true
307    }
308}
309
310impl<F: FnMut(char) -> bool> Pattern for F {
311    fn split_prefix(self, input: &mut Input) -> bool {
312        input.next().map_or(false, self)
313    }
314}
315
316impl<'i> Iterator for Input<'i> {
317    type Item = char;
318    fn next(&mut self) -> Option<char> {
319        self.chars
320            .by_ref()
321            .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
322    }
323}
324
/// State of a single URL parse.
pub struct Parser<'a> {
    /// Output buffer that the parsing methods append the serialized URL to.
    pub serialization: String,
    /// Base URL that input without a scheme is resolved against, if any.
    pub base_url: Option<&'a Url>,
    /// Encoding override for the query component.
    // NOTE(review): its use is not visible in this part of the file.
    pub query_encoding_override: EncodingOverride<'a>,
    /// Optional callback invoked for each non-fatal syntax violation.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// The situation the parser is running in; see [`Context`].
    pub context: Context,
}
332
/// Identifies what the parser is being used for.
#[derive(PartialEq, Eq, Copy, Clone)]
pub enum Context {
    /// Presumably a regular URL parse — TODO confirm against callers.
    UrlParser,
    /// Used by `Parser::for_setter`; in this context `parse_scheme` accepts
    /// input that ends before a `:`.
    Setter,
    /// Presumably used when setting individual path segments — TODO confirm.
    PathSegmentSetter,
}
339
340impl<'a> Parser<'a> {
341    fn log_violation(&self, v: SyntaxViolation) {
342        if let Some(f) = self.violation_fn {
343            f(v)
344        }
345    }
346
347    fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
348        if let Some(f) = self.violation_fn {
349            if test() {
350                f(v)
351            }
352        }
353    }
354
355    pub fn for_setter(serialization: String) -> Parser<'a> {
356        Parser {
357            serialization,
358            base_url: None,
359            query_encoding_override: None,
360            violation_fn: None,
361            context: Context::Setter,
362        }
363    }
364
365    /// https://url.spec.whatwg.org/#concept-basic-url-parser
366    pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
367        let input = Input::new_trim_c0_control_and_space(input, self.violation_fn);
368        if let Ok(remaining) = self.parse_scheme(input.clone()) {
369            return self.parse_with_scheme(remaining);
370        }
371
372        // No-scheme state
373        if let Some(base_url) = self.base_url {
374            if input.starts_with('#') {
375                self.fragment_only(base_url, input)
376            } else if base_url.cannot_be_a_base() {
377                Err(ParseError::RelativeUrlWithCannotBeABaseBase)
378            } else {
379                let scheme_type = SchemeType::from(base_url.scheme());
380                if scheme_type.is_file() {
381                    self.parse_file(input, scheme_type, Some(base_url))
382                } else {
383                    self.parse_relative(input, scheme_type, base_url)
384                }
385            }
386        } else {
387            Err(ParseError::RelativeUrlWithoutBase)
388        }
389    }
390
391    pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
392        if input.is_empty() || !input.starts_with(ascii_alpha) {
393            return Err(());
394        }
395        debug_assert!(self.serialization.is_empty());
396        while let Some(c) = input.next() {
397            match c {
398                'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
399                    self.serialization.push(c.to_ascii_lowercase())
400                }
401                ':' => return Ok(input),
402                _ => {
403                    self.serialization.clear();
404                    return Err(());
405                }
406            }
407        }
408        // EOF before ':'
409        if self.context == Context::Setter {
410            Ok(input)
411        } else {
412            self.serialization.clear();
413            Err(())
414        }
415    }
416
417    fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
418        use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
419        let scheme_end = to_u32(self.serialization.len())?;
420        let scheme_type = SchemeType::from(&self.serialization);
421        self.serialization.push(':');
422        match scheme_type {
423            SchemeType::File => {
424                self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
425                let base_file_url = self.base_url.and_then(|base| {
426                    if base.scheme() == "file" {
427                        Some(base)
428                    } else {
429                        None
430                    }
431                });
432                self.serialization.clear();
433                self.parse_file(input, scheme_type, base_file_url)
434            }
435            SchemeType::SpecialNotFile => {
436                // special relative or authority state
437                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
438                if let Some(base_url) = self.base_url {
439                    if slashes_count < 2
440                        && base_url.scheme() == &self.serialization[..scheme_end as usize]
441                    {
442                        // "Cannot-be-a-base" URLs only happen with "not special" schemes.
443                        debug_assert!(!base_url.cannot_be_a_base());
444                        self.serialization.clear();
445                        return self.parse_relative(input, scheme_type, base_url);
446                    }
447                }
448                // special authority slashes state
449                self.log_violation_if(ExpectedDoubleSlash, || {
450                    input
451                        .clone()
452                        .take_while(|&c| matches!(c, '/' | '\\'))
453                        .collect::<String>()
454                        != "//"
455                });
456                self.after_double_slash(remaining, scheme_type, scheme_end)
457            }
458            SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
459        }
460    }
461
462    /// Scheme other than file, http, https, ws, ws, ftp.
463    fn parse_non_special(
464        mut self,
465        input: Input<'_>,
466        scheme_type: SchemeType,
467        scheme_end: u32,
468    ) -> ParseResult<Url> {
469        // path or authority state (
470        if let Some(input) = input.split_prefix("//") {
471            return self.after_double_slash(input, scheme_type, scheme_end);
472        }
473        // Anarchist URL (no authority)
474        let path_start = to_u32(self.serialization.len())?;
475        let username_end = path_start;
476        let host_start = path_start;
477        let host_end = path_start;
478        let host = HostInternal::None;
479        let port = None;
480        let remaining = if let Some(input) = input.split_prefix('/') {
481            self.serialization.push('/');
482            self.parse_path(scheme_type, &mut false, path_start as usize, input)
483        } else {
484            self.parse_cannot_be_a_base_path(input)
485        };
486        self.with_query_and_fragment(
487            scheme_type,
488            scheme_end,
489            username_end,
490            host_start,
491            host_end,
492            host,
493            port,
494            path_start,
495            remaining,
496        )
497    }
498
    /// Parses a `file:` URL, either absolute or relative to `base_file_url`.
    ///
    /// `input` is what follows the scheme (or the whole input when there was
    /// no scheme) and `self.serialization` must be empty on entry. The cases
    /// handled, in order:
    ///
    /// * two leading slashes: a host is parsed, followed by the path;
    /// * one leading slash: an absolute path, possibly inheriting the drive
    ///   letter or the host of the base URL;
    /// * no leading slash with a base URL: empty input, `?query`,
    ///   `#fragment` or a relative path, resolved against the base;
    /// * no leading slash and no base URL: the input is parsed as a path
    ///   under `file:///`.
    ///
    /// # Errors
    ///
    /// Propagates errors from `parse_file_host`, `to_u32`,
    /// `parse_query_and_fragment`, `with_query_and_fragment` and
    /// `fragment_only`.
    fn parse_file(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_file_url: Option<&Url>,
    ) -> ParseResult<Url> {
        use crate::SyntaxViolation::Backslash;
        // file state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        if matches!(first_char, Some('/') | Some('\\')) {
            self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
            // file slash state
            let (next_char, input_after_next_char) = input_after_first_char.split_first();
            if matches!(next_char, Some('/') | Some('\\')) {
                self.log_violation_if(Backslash, || next_char == Some('\\'));
                // file host state
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len() as u32;
                // The returned `path_start` is a flag that selects how the
                // path is parsed below; it is not an offset.
                let (path_start, mut host, remaining) =
                    self.parse_file_host(input_after_next_char)?;
                let mut host_end = to_u32(self.serialization.len())?;
                let mut has_host = !matches!(host, HostInternal::None);
                // `has_host` is handed to the path parser by mutable
                // reference and re-checked afterwards.
                let remaining = if path_start {
                    self.parse_path_start(SchemeType::File, &mut has_host, remaining)
                } else {
                    let path_start = self.serialization.len();
                    self.serialization.push('/');
                    self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
                };

                // For file URLs that have a host and whose path starts
                // with the windows drive letter we just remove the host.
                if !has_host {
                    self.serialization
                        .drain(host_start as usize..host_end as usize);
                    host_end = host_start;
                    host = HostInternal::None;
                }
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            } else {
                // Single leading slash: an absolute path with no host in the input.
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len();
                let mut host_end = host_start;
                let mut host = HostInternal::None;
                // Unless the input brings its own drive letter, inherit the
                // base URL's drive letter or, failing that, its host.
                if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                    if let Some(base_url) = base_file_url {
                        // NOTE(review): both unwraps assume the base file URL
                        // has path segments and at least one of them — confirm.
                        let first_segment = base_url.path_segments().unwrap().next().unwrap();
                        if is_normalized_windows_drive_letter(first_segment) {
                            self.serialization.push('/');
                            self.serialization.push_str(first_segment);
                        } else if let Some(host_str) = base_url.host_str() {
                            self.serialization.push_str(host_str);
                            host_end = self.serialization.len();
                            host = base_url.host;
                        }
                    }
                }
                // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
                // (In this branch `first_char` is always '/' or '\\', so this
                // always selects `input`.)
                let parse_path_input = if let Some(c) = first_char {
                    if c == '/' || c == '\\' || c == '?' || c == '#' {
                        input
                    } else {
                        input_after_first_char
                    }
                } else {
                    input_after_first_char
                };

                let remaining =
                    self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

                let host_start = host_start as u32;

                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

                let host_end = host_end as u32;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            }
        }
        // The input does not start with a slash.
        if let Some(base_url) = base_file_url {
            match first_char {
                None => {
                    // Copy everything except the fragment
                    let before_fragment = match base_url.fragment_start {
                        Some(i) => &base_url.serialization[..i as usize],
                        None => &*base_url.serialization,
                    };
                    self.serialization.push_str(before_fragment);
                    Ok(Url {
                        serialization: self.serialization,
                        fragment_start: None,
                        ..*base_url
                    })
                }
                Some('?') => {
                    // Copy everything up to the query string
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                    Ok(Url {
                        serialization: self.serialization,
                        query_start,
                        fragment_start,
                        ..*base_url
                    })
                }
                Some('#') => self.fragment_only(base_url, input),
                _ => {
                    if !starts_with_windows_drive_letter_segment(&input) {
                        // Relative path: start from the base URL without its
                        // query and fragment, shorten its path, then parse
                        // the input on top of it.
                        let before_query = match (base_url.query_start, base_url.fragment_start) {
                            (None, None) => &*base_url.serialization,
                            (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                        };
                        self.serialization.push_str(before_query);
                        self.shorten_path(SchemeType::File, base_url.path_start as usize);
                        let remaining = self.parse_path(
                            SchemeType::File,
                            &mut true,
                            base_url.path_start as usize,
                            input,
                        );
                        self.with_query_and_fragment(
                            SchemeType::File,
                            base_url.scheme_end,
                            base_url.username_end,
                            base_url.host_start,
                            base_url.host_end,
                            base_url.host,
                            base_url.port,
                            base_url.path_start,
                            remaining,
                        )
                    } else {
                        // The input starts with a Windows drive letter: the
                        // base URL is ignored and a host-less URL is built.
                        self.serialization.push_str("file:///");
                        let scheme_end = "file".len() as u32;
                        let path_start = "file://".len();
                        let remaining =
                            self.parse_path(SchemeType::File, &mut false, path_start, input);
                        let (query_start, fragment_start) =
                            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                        let path_start = path_start as u32;
                        Ok(Url {
                            serialization: self.serialization,
                            scheme_end,
                            username_end: path_start,
                            host_start: path_start,
                            host_end: path_start,
                            host: HostInternal::None,
                            port: None,
                            path_start,
                            query_start,
                            fragment_start,
                        })
                    }
                }
            }
        } else {
            // No base URL: parse the input as a path under "file:///".
            self.serialization.push_str("file:///");
            let scheme_end = "file".len() as u32;
            let path_start = "file://".len();
            let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
            let path_start = path_start as u32;
            Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: path_start,
                host_start: path_start,
                host_end: path_start,
                host: HostInternal::None,
                port: None,
                path_start,
                query_start,
                fragment_start,
            })
        }
    }
709
    /// Resolves a relative reference (`input`) against `base_url`.
    ///
    /// `self.serialization` must be empty on entry. The first character of
    /// `input` selects the case:
    ///
    /// * end of input: the base URL without its fragment;
    /// * `?`: the base URL up to its query, followed by the new query and
    ///   fragment;
    /// * `#`: handled by `fragment_only`;
    /// * `/` or `\`: with two or more slashes the base's scheme is kept and
    ///   an authority is parsed; with one slash the path replaces the
    ///   base's path;
    /// * anything else: a path resolved relative to the base's path.
    ///
    /// # Errors
    ///
    /// Propagates errors from `parse_query_and_fragment`, `fragment_only`,
    /// `after_double_slash` and `with_query_and_fragment`.
    fn parse_relative(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_url: &Url,
    ) -> ParseResult<Url> {
        // relative state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        match first_char {
            None => {
                // Copy everything except the fragment
                let before_fragment = match base_url.fragment_start {
                    Some(i) => &base_url.serialization[..i as usize],
                    None => &*base_url.serialization,
                };
                self.serialization.push_str(before_fragment);
                Ok(Url {
                    serialization: self.serialization,
                    fragment_start: None,
                    ..*base_url
                })
            }
            Some('?') => {
                // Copy everything up to the query string
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                Ok(Url {
                    serialization: self.serialization,
                    query_start,
                    fragment_start,
                    ..*base_url
                })
            }
            Some('#') => self.fragment_only(base_url, input),
            Some('/') | Some('\\') => {
                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
                if slashes_count >= 2 {
                    // Scheme-relative reference: anything other than exactly
                    // "//" in front of the authority is reported.
                    self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
                        input
                            .clone()
                            .take_while(|&c| matches!(c, '/' | '\\'))
                            .collect::<String>()
                            != "//"
                    });
                    let scheme_end = base_url.scheme_end;
                    debug_assert!(base_url.byte_at(scheme_end) == b':');
                    // Keep the base URL's scheme, including the ':'.
                    self.serialization
                        .push_str(base_url.slice(..scheme_end + 1));
                    // When the input starts with exactly "//", continue right
                    // after those two characters; otherwise continue after
                    // the whole run of slashes and backslashes.
                    if let Some(after_prefix) = input.split_prefix("//") {
                        return self.after_double_slash(after_prefix, scheme_type, scheme_end);
                    }
                    return self.after_double_slash(remaining, scheme_type, scheme_end);
                }
                // Single slash: keep everything before the base's path and
                // parse the input as a new absolute path.
                let path_start = base_url.path_start;
                self.serialization.push_str(base_url.slice(..path_start));
                self.serialization.push('/');
                let remaining = self.parse_path(
                    scheme_type,
                    &mut true,
                    path_start as usize,
                    input_after_first_char,
                );
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
            _ => {
                // Relative path: start from the base URL without its query
                // and fragment.
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                // FIXME spec says just "remove last entry", not the "pop" algorithm
                self.pop_path(scheme_type, base_url.path_start as usize);
                // A special url always has a path.
                // A path always starts with '/'
                if self.serialization.len() == base_url.path_start as usize
                    && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
                {
                    self.serialization.push('/');
                }
                // NOTE(review): `first_char` is never '/' in this arm (that
                // case is matched above), so the first arm below looks
                // unreachable — confirm before relying on it.
                let remaining = match input.split_first() {
                    (Some('/'), remaining) => self.parse_path(
                        scheme_type,
                        &mut true,
                        base_url.path_start as usize,
                        remaining,
                    ),
                    _ => {
                        self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
                    }
                };
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
        }
    }
830
831    fn after_double_slash(
832        mut self,
833        input: Input<'_>,
834        scheme_type: SchemeType,
835        scheme_end: u32,
836    ) -> ParseResult<Url> {
837        self.serialization.push('/');
838        self.serialization.push('/');
839        // authority state
840        let before_authority = self.serialization.len();
841        let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
842        let has_authority = before_authority != self.serialization.len();
843        // host state
844        let host_start = to_u32(self.serialization.len())?;
845        let (host_end, host, port, remaining) =
846            self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
847        if host == HostInternal::None && has_authority {
848            return Err(ParseError::EmptyHost);
849        }
850        // path state
851        let path_start = to_u32(self.serialization.len())?;
852        let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
853        self.with_query_and_fragment(
854            scheme_type,
855            scheme_end,
856            username_end,
857            host_start,
858            host_end,
859            host,
860            port,
861            path_start,
862            remaining,
863        )
864    }
865
866    /// Return (username_end, remaining)
867    fn parse_userinfo<'i>(
868        &mut self,
869        mut input: Input<'i>,
870        scheme_type: SchemeType,
871    ) -> ParseResult<(u32, Input<'i>)> {
872        let mut last_at = None;
873        let mut remaining = input.clone();
874        let mut char_count = 0;
875        while let Some(c) = remaining.next() {
876            match c {
877                '@' => {
878                    if last_at.is_some() {
879                        self.log_violation(SyntaxViolation::UnencodedAtSign)
880                    } else {
881                        self.log_violation(SyntaxViolation::EmbeddedCredentials)
882                    }
883                    last_at = Some((char_count, remaining.clone()))
884                }
885                '/' | '?' | '#' => break,
886                '\\' if scheme_type.is_special() => break,
887                _ => (),
888            }
889            char_count += 1;
890        }
891        let (mut userinfo_char_count, remaining) = match last_at {
892            None => return Ok((to_u32(self.serialization.len())?, input)),
893            Some((0, remaining)) => {
894                // Otherwise, if one of the following is true
895                // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
896                // url is special and c is U+005C (\)
897                // If @ flag is set and buffer is the empty string, validation error, return failure.
898                if let (Some(c), _) = remaining.split_first() {
899                    if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
900                        return Err(ParseError::EmptyHost);
901                    }
902                }
903                return Ok((to_u32(self.serialization.len())?, remaining));
904            }
905            Some(x) => x,
906        };
907
908        let mut username_end = None;
909        let mut has_password = false;
910        let mut has_username = false;
911        while userinfo_char_count > 0 {
912            let (c, utf8_c) = input.next_utf8().unwrap();
913            userinfo_char_count -= 1;
914            if c == ':' && username_end.is_none() {
915                // Start parsing password
916                username_end = Some(to_u32(self.serialization.len())?);
917                // We don't add a colon if the password is empty
918                if userinfo_char_count > 0 {
919                    self.serialization.push(':');
920                    has_password = true;
921                }
922            } else {
923                if !has_password {
924                    has_username = true;
925                }
926                self.check_url_code_point(c, &input);
927                self.serialization
928                    .extend(utf8_percent_encode(utf8_c, USERINFO));
929            }
930        }
931        let username_end = match username_end {
932            Some(i) => i,
933            None => to_u32(self.serialization.len())?,
934        };
935        if has_username || has_password {
936            self.serialization.push('@');
937        }
938        Ok((username_end, remaining))
939    }
940
941    fn parse_host_and_port<'i>(
942        &mut self,
943        input: Input<'i>,
944        scheme_end: u32,
945        scheme_type: SchemeType,
946    ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
947        let (host, remaining) = Parser::parse_host(input, scheme_type)?;
948        write!(&mut self.serialization, "{}", host).unwrap();
949        let host_end = to_u32(self.serialization.len())?;
950        if let Host::Domain(h) = &host {
951            if h.is_empty() {
952                // Port with an empty host
953                if remaining.starts_with(":") {
954                    return Err(ParseError::EmptyHost);
955                }
956                if scheme_type.is_special() {
957                    return Err(ParseError::EmptyHost);
958                }
959            }
960        };
961
962        let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
963            let scheme = || default_port(&self.serialization[..scheme_end as usize]);
964            Parser::parse_port(remaining, scheme, self.context)?
965        } else {
966            (None, remaining)
967        };
968        if let Some(port) = port {
969            write!(&mut self.serialization, ":{}", port).unwrap()
970        }
971        Ok((host_end, host.into(), port, remaining))
972    }
973
974    pub fn parse_host(
975        mut input: Input<'_>,
976        scheme_type: SchemeType,
977    ) -> ParseResult<(Host<String>, Input<'_>)> {
978        if scheme_type.is_file() {
979            return Parser::get_file_host(input);
980        }
981        // Undo the Input abstraction here to avoid allocating in the common case
982        // where the host part of the input does not contain any tab or newline
983        let input_str = input.chars.as_str();
984        let mut inside_square_brackets = false;
985        let mut has_ignored_chars = false;
986        let mut non_ignored_chars = 0;
987        let mut bytes = 0;
988        for c in input_str.chars() {
989            match c {
990                ':' if !inside_square_brackets => break,
991                '\\' if scheme_type.is_special() => break,
992                '/' | '?' | '#' => break,
993                '\t' | '\n' | '\r' => {
994                    has_ignored_chars = true;
995                }
996                '[' => {
997                    inside_square_brackets = true;
998                    non_ignored_chars += 1
999                }
1000                ']' => {
1001                    inside_square_brackets = false;
1002                    non_ignored_chars += 1
1003                }
1004                _ => non_ignored_chars += 1,
1005            }
1006            bytes += c.len_utf8();
1007        }
1008        let replaced: String;
1009        let host_str;
1010        {
1011            let host_input = input.by_ref().take(non_ignored_chars);
1012            if has_ignored_chars {
1013                replaced = host_input.collect();
1014                host_str = &*replaced
1015            } else {
1016                for _ in host_input {}
1017                host_str = &input_str[..bytes]
1018            }
1019        }
1020        if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
1021            return Err(ParseError::EmptyHost);
1022        }
1023        if !scheme_type.is_special() {
1024            let host = Host::parse_opaque(host_str)?;
1025            return Ok((host, input));
1026        }
1027        let host = Host::parse(host_str)?;
1028        Ok((host, input))
1029    }
1030
1031    fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1032        let (_, host_str, remaining) = Parser::file_host(input)?;
1033        let host = match Host::parse(&host_str)? {
1034            Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1035            host => host,
1036        };
1037        Ok((host, remaining))
1038    }
1039
1040    fn parse_file_host<'i>(
1041        &mut self,
1042        input: Input<'i>,
1043    ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1044        let has_host;
1045        let (_, host_str, remaining) = Parser::file_host(input)?;
1046        let host = if host_str.is_empty() {
1047            has_host = false;
1048            HostInternal::None
1049        } else {
1050            match Host::parse(&host_str)? {
1051                Host::Domain(ref d) if d == "localhost" => {
1052                    has_host = false;
1053                    HostInternal::None
1054                }
1055                host => {
1056                    write!(&mut self.serialization, "{}", host).unwrap();
1057                    has_host = true;
1058                    host.into()
1059                }
1060            }
1061        };
1062        Ok((has_host, host, remaining))
1063    }
1064
1065    pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1066        // Undo the Input abstraction here to avoid allocating in the common case
1067        // where the host part of the input does not contain any tab or newline
1068        let input_str = input.chars.as_str();
1069        let mut has_ignored_chars = false;
1070        let mut non_ignored_chars = 0;
1071        let mut bytes = 0;
1072        for c in input_str.chars() {
1073            match c {
1074                '/' | '\\' | '?' | '#' => break,
1075                '\t' | '\n' | '\r' => has_ignored_chars = true,
1076                _ => non_ignored_chars += 1,
1077            }
1078            bytes += c.len_utf8();
1079        }
1080        let replaced: String;
1081        let host_str;
1082        let mut remaining = input.clone();
1083        {
1084            let host_input = remaining.by_ref().take(non_ignored_chars);
1085            if has_ignored_chars {
1086                replaced = host_input.collect();
1087                host_str = &*replaced
1088            } else {
1089                for _ in host_input {}
1090                host_str = &input_str[..bytes]
1091            }
1092        }
1093        if is_windows_drive_letter(host_str) {
1094            return Ok((false, "".to_string(), input));
1095        }
1096        Ok((true, host_str.to_string(), remaining))
1097    }
1098
1099    pub fn parse_port<P>(
1100        mut input: Input<'_>,
1101        default_port: P,
1102        context: Context,
1103    ) -> ParseResult<(Option<u16>, Input<'_>)>
1104    where
1105        P: Fn() -> Option<u16>,
1106    {
1107        let mut port: u32 = 0;
1108        let mut has_any_digit = false;
1109        while let (Some(c), remaining) = input.split_first() {
1110            if let Some(digit) = c.to_digit(10) {
1111                port = port * 10 + digit;
1112                if port > u16::MAX as u32 {
1113                    return Err(ParseError::InvalidPort);
1114                }
1115                has_any_digit = true;
1116            } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1117                return Err(ParseError::InvalidPort);
1118            } else {
1119                break;
1120            }
1121            input = remaining;
1122        }
1123        let mut opt_port = Some(port as u16);
1124        if !has_any_digit || opt_port == default_port() {
1125            opt_port = None;
1126        }
1127        Ok((opt_port, input))
1128    }
1129
1130    pub fn parse_path_start<'i>(
1131        &mut self,
1132        scheme_type: SchemeType,
1133        has_host: &mut bool,
1134        input: Input<'i>,
1135    ) -> Input<'i> {
1136        let path_start = self.serialization.len();
1137        let (maybe_c, remaining) = input.split_first();
1138        // If url is special, then:
1139        if scheme_type.is_special() {
1140            if maybe_c == Some('\\') {
1141                // If c is U+005C (\), validation error.
1142                self.log_violation(SyntaxViolation::Backslash);
1143            }
1144            // A special URL always has a non-empty path.
1145            if !self.serialization.ends_with('/') {
1146                self.serialization.push('/');
1147                // We have already made sure the forward slash is present.
1148                if maybe_c == Some('/') || maybe_c == Some('\\') {
1149                    return self.parse_path(scheme_type, has_host, path_start, remaining);
1150                }
1151            }
1152            return self.parse_path(scheme_type, has_host, path_start, input);
1153        } else if maybe_c == Some('?') || maybe_c == Some('#') {
1154            // Otherwise, if state override is not given and c is U+003F (?),
1155            // set url’s query to the empty string and state to query state.
1156            // Otherwise, if state override is not given and c is U+0023 (#),
1157            // set url’s fragment to the empty string and state to fragment state.
1158            // The query and path states will be handled by the caller.
1159            return input;
1160        }
1161
1162        if maybe_c.is_some() && maybe_c != Some('/') {
1163            self.serialization.push('/');
1164        }
1165        // Otherwise, if c is not the EOF code point:
1166        self.parse_path(scheme_type, has_host, path_start, input)
1167    }
1168
    /// Parse a hierarchical path from `input`, appending the normalized and
    /// percent-encoded result to `self.serialization`.
    ///
    /// The path is processed one segment at a time. `.` and `..` segments
    /// (including their `%2e` spellings) are resolved against what has
    /// already been serialized, and for `file` URLs a leading Windows drive
    /// letter segment is normalized to end in `:`.
    ///
    /// * `path_start` is the byte offset in the serialization where the path
    ///   begins.
    /// * `has_host` is set to `false` when a `file` URL's first segment is a
    ///   Windows drive letter while `*has_host` was `true`.
    ///
    /// In `Context::UrlParser` parsing stops before an unconsumed `?` or `#`.
    /// In `Context::PathSegmentSetter`, `/` and `\` are not separators and are
    /// percent-encoded along with the rest of the segment.
    ///
    /// Returns the input remaining after the path.
    pub fn parse_path<'i>(
        &mut self,
        scheme_type: SchemeType,
        has_host: &mut bool,
        path_start: usize,
        mut input: Input<'i>,
    ) -> Input<'i> {
        // Relative path state
        // Each iteration of the outer loop handles one path segment.
        loop {
            let mut segment_start = self.serialization.len();
            let mut ends_with_slash = false;
            // Consume characters until a separator, `?`/`#`, or end of input.
            loop {
                let input_before_c = input.clone();
                let (c, utf8_c) = if let Some(x) = input.next_utf8() {
                    x
                } else {
                    break;
                };
                match c {
                    '/' if self.context != Context::PathSegmentSetter => {
                        self.serialization.push(c);
                        ends_with_slash = true;
                        break;
                    }
                    '\\' if self.context != Context::PathSegmentSetter
                        && scheme_type.is_special() =>
                    {
                        // A backslash separator is serialized as a forward slash.
                        self.log_violation(SyntaxViolation::Backslash);
                        self.serialization.push('/');
                        ends_with_slash = true;
                        break;
                    }
                    '?' | '#' if self.context == Context::UrlParser => {
                        // Un-read the delimiter so it is still at the front of
                        // the returned input.
                        input = input_before_c;
                        break;
                    }
                    _ => {
                        self.check_url_code_point(c, &input);
                        // For file URLs: if everything serialized after the
                        // path's leading character is a normalized drive
                        // letter, insert a `/` so this character starts a new
                        // segment.
                        if scheme_type.is_file()
                            && self.serialization.len() > path_start
                            && is_normalized_windows_drive_letter(
                                &self.serialization[path_start + 1..],
                            )
                        {
                            self.serialization.push('/');
                            segment_start += 1;
                        }
                        if self.context == Context::PathSegmentSetter {
                            if scheme_type.is_special() {
                                self.serialization
                                    .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
                            } else {
                                self.serialization
                                    .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
                            }
                        } else {
                            self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
                        }
                    }
                }
            }
            // The segment just serialized, without its trailing separator.
            let segment_before_slash = if ends_with_slash {
                &self.serialization[segment_start..self.serialization.len() - 1]
            } else {
                &self.serialization[segment_start..self.serialization.len()]
            };
            match segment_before_slash {
                // If buffer is a double-dot path segment, shorten url’s path,
                ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
                | ".%2E" => {
                    debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
                    // Drop the ".." itself, then the previous segment.
                    self.serialization.truncate(segment_start);
                    if self.serialization.ends_with('/')
                        && Parser::last_slash_can_be_removed(&self.serialization, path_start)
                    {
                        self.serialization.pop();
                    }
                    self.shorten_path(scheme_type, path_start);

                    // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
                    if ends_with_slash && !self.serialization.ends_with('/') {
                        self.serialization.push('/');
                    }
                }
                // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
                // nor url is special and c is U+005C (\), append the empty string to url’s path.
                "." | "%2e" | "%2E" => {
                    self.serialization.truncate(segment_start);
                    if !self.serialization.ends_with('/') {
                        self.serialization.push('/');
                    }
                }
                _ => {
                    // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
                    if scheme_type.is_file()
                        && segment_start == path_start + 1
                        && is_windows_drive_letter(segment_before_slash)
                    {
                        // Replace the second code point in buffer with U+003A (:).
                        if let Some(c) = segment_before_slash.chars().next() {
                            self.serialization.truncate(segment_start);
                            self.serialization.push(c);
                            self.serialization.push(':');
                            if ends_with_slash {
                                self.serialization.push('/');
                            }
                        }
                        // If url’s host is neither the empty string nor null,
                        // validation error, set url’s host to the empty string.
                        if *has_host {
                            self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
                            *has_host = false; // FIXME account for this in callers
                        }
                    }
                }
            }
            // No trailing separator means this was the last segment.
            if !ends_with_slash {
                break;
            }
        }
        if scheme_type.is_file() {
            // while url’s path’s size is greater than 1
            // and url’s path[0] is the empty string,
            // validation error, remove the first item from url’s path.
            //FIXME: log violation
            // Rewrite the path so that it starts with exactly one '/'.
            let path = self.serialization.split_off(path_start);
            self.serialization.push('/');
            self.serialization.push_str(path.trim_start_matches('/'));
        }

        input
    }
1301
1302    fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1303        let url_before_segment = &serialization[..serialization.len() - 1];
1304        if let Some(segment_before_start) = url_before_segment.rfind('/') {
1305            // Do not remove the root slash
1306            segment_before_start >= path_start
1307                // Or a windows drive letter slash
1308                && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1309        } else {
1310            false
1311        }
1312    }
1313
1314    /// https://url.spec.whatwg.org/#shorten-a-urls-path
1315    fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1316        // If path is empty, then return.
1317        if self.serialization.len() == path_start {
1318            return;
1319        }
1320        // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1321        if scheme_type.is_file()
1322            && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1323        {
1324            return;
1325        }
1326        // Remove path’s last item.
1327        self.pop_path(scheme_type, path_start);
1328    }
1329
1330    /// https://url.spec.whatwg.org/#pop-a-urls-path
1331    fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1332        if self.serialization.len() > path_start {
1333            let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1334            // + 1 since rfind returns the position before the slash.
1335            let segment_start = path_start + slash_position + 1;
1336            // Don’t pop a Windows drive letter
1337            if !(scheme_type.is_file()
1338                && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1339            {
1340                self.serialization.truncate(segment_start);
1341            }
1342        }
1343    }
1344
1345    pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1346        loop {
1347            let input_before_c = input.clone();
1348            match input.next_utf8() {
1349                Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1350                    return input_before_c
1351                }
1352                Some((c, utf8_c)) => {
1353                    self.check_url_code_point(c, &input);
1354                    self.serialization
1355                        .extend(utf8_percent_encode(utf8_c, CONTROLS));
1356                }
1357                None => return input,
1358            }
1359        }
1360    }
1361
    /// Finish parsing: fix up the `/.` path prefix if needed, parse the query
    /// and fragment from `remaining`, and assemble the final `Url`.
    ///
    /// All offset arguments are byte positions into `self.serialization` and
    /// are stored in the returned `Url` unchanged, except `path_start`, which
    /// is shifted by two when the `/.` prefix is inserted or removed.
    ///
    /// Errors from `parse_query_and_fragment` are propagated.
    ///
    /// # Panics
    ///
    /// Panics if one of the internal assertions about the serialized layout
    /// around `scheme_end` does not hold.
    #[allow(clippy::too_many_arguments)]
    fn with_query_and_fragment(
        mut self,
        scheme_type: SchemeType,
        scheme_end: u32,
        username_end: u32,
        host_start: u32,
        host_end: u32,
        host: HostInternal,
        port: Option<u16>,
        mut path_start: u32,
        remaining: Input<'_>,
    ) -> ParseResult<Url> {
        // Special case for anarchist URL's with a leading empty path segment
        // This prevents web+demo:/.//not-a-host/ or web+demo:/path/..//not-a-host/,
        // when parsed and then serialized, from ending up as web+demo://not-a-host/
        // (they end up as web+demo:/.//not-a-host/).
        //
        // If url’s host is null, url does not have an opaque path,
        // url’s path’s size is greater than 1, and url’s path[0] is the empty string,
        // then append U+002F (/) followed by U+002E (.) to output.
        let scheme_end_as_usize = scheme_end as usize;
        let path_start_as_usize = path_start as usize;
        // The path starts right after the one-byte scheme delimiter, so there
        // is no authority between them.
        if path_start_as_usize == scheme_end_as_usize + 1 {
            // Anarchist URL
            if self.serialization[path_start_as_usize..].starts_with("//") {
                // Case 1: The base URL did not have an empty path segment, but the resulting one does
                // Insert the "/." prefix
                self.serialization.insert_str(path_start_as_usize, "/.");
                path_start += 2;
            }
            assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
        } else if path_start_as_usize == scheme_end_as_usize + 3
            && &self.serialization[scheme_end_as_usize..path_start_as_usize] == ":/."
        {
            // Anarchist URL with leading empty path segment
            // The base URL has a "/." between the host and the path
            assert_eq!(self.serialization.as_bytes()[path_start_as_usize], b'/');
            // `get` is used because the path may consist of the single '/' only.
            if self
                .serialization
                .as_bytes()
                .get(path_start_as_usize + 1)
                .copied()
                != Some(b'/')
            {
                // Case 2: The base URL had an empty path segment, but the resulting one does not
                // Remove the "/." prefix
                self.serialization
                    .replace_range(scheme_end_as_usize..path_start_as_usize, ":");
                path_start -= 2;
            }
            assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
        }

        let (query_start, fragment_start) =
            self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
        Ok(Url {
            serialization: self.serialization,
            scheme_end,
            username_end,
            host_start,
            host_end,
            host,
            port,
            path_start,
            query_start,
            fragment_start,
        })
    }
1431
1432    /// Return (query_start, fragment_start)
1433    fn parse_query_and_fragment(
1434        &mut self,
1435        scheme_type: SchemeType,
1436        scheme_end: u32,
1437        mut input: Input<'_>,
1438    ) -> ParseResult<(Option<u32>, Option<u32>)> {
1439        let mut query_start = None;
1440        match input.next() {
1441            Some('#') => {}
1442            Some('?') => {
1443                query_start = Some(to_u32(self.serialization.len())?);
1444                self.serialization.push('?');
1445                let remaining = self.parse_query(scheme_type, scheme_end, input);
1446                if let Some(remaining) = remaining {
1447                    input = remaining
1448                } else {
1449                    return Ok((query_start, None));
1450                }
1451            }
1452            None => return Ok((None, None)),
1453            _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1454        }
1455
1456        let fragment_start = to_u32(self.serialization.len())?;
1457        self.serialization.push('#');
1458        self.parse_fragment(input);
1459        Ok((query_start, Some(fragment_start)))
1460    }
1461
1462    pub fn parse_query<'i>(
1463        &mut self,
1464        scheme_type: SchemeType,
1465        scheme_end: u32,
1466        mut input: Input<'i>,
1467    ) -> Option<Input<'i>> {
1468        let len = input.chars.as_str().len();
1469        let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1470        let mut remaining = None;
1471        while let Some(c) = input.next() {
1472            if c == '#' && self.context == Context::UrlParser {
1473                remaining = Some(input);
1474                break;
1475            } else {
1476                self.check_url_code_point(c, &input);
1477                query.push(c);
1478            }
1479        }
1480
1481        let encoding = match &self.serialization[..scheme_end as usize] {
1482            "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1483            _ => None,
1484        };
1485        let query_bytes = if let Some(o) = encoding {
1486            o(&query)
1487        } else {
1488            query.as_bytes().into()
1489        };
1490        let set = if scheme_type.is_special() {
1491            SPECIAL_QUERY
1492        } else {
1493            QUERY
1494        };
1495        self.serialization.extend(percent_encode(&query_bytes, set));
1496        remaining
1497    }
1498
1499    fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1500        let before_fragment = match base_url.fragment_start {
1501            Some(i) => base_url.slice(..i),
1502            None => &*base_url.serialization,
1503        };
1504        debug_assert!(self.serialization.is_empty());
1505        self.serialization
1506            .reserve(before_fragment.len() + input.chars.as_str().len());
1507        self.serialization.push_str(before_fragment);
1508        self.serialization.push('#');
1509        let next = input.next();
1510        debug_assert!(next == Some('#'));
1511        self.parse_fragment(input);
1512        Ok(Url {
1513            serialization: self.serialization,
1514            fragment_start: Some(to_u32(before_fragment.len())?),
1515            ..*base_url
1516        })
1517    }
1518
1519    pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1520        while let Some((c, utf8_c)) = input.next_utf8() {
1521            if c == '\0' {
1522                self.log_violation(SyntaxViolation::NullInFragment)
1523            } else {
1524                self.check_url_code_point(c, &input);
1525            }
1526            self.serialization
1527                .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1528        }
1529    }
1530
1531    fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1532        if let Some(vfn) = self.violation_fn {
1533            if c == '%' {
1534                let mut input = input.clone();
1535                if !matches!((input.next(), input.next()), (Some(a), Some(b))
1536                             if a.is_ascii_hexdigit() && b.is_ascii_hexdigit())
1537                {
1538                    vfn(SyntaxViolation::PercentDecode)
1539                }
1540            } else if !is_url_code_point(c) {
1541                vfn(SyntaxViolation::NonUrlCodePoint)
1542            }
1543        }
1544    }
1545}
1546
/// Whether `c` is a URL code point.
///
/// URL code points are ASCII alphanumerics, the punctuation listed in the
/// match below, and every code point from U+00A0 to U+10FFFD that is neither
/// a surrogate nor a noncharacter.
///
/// Non URL code points:
/// U+0000 to U+0020 (space)
/// " # % < > [ \ ] ^ ` { | }
/// U+007F to U+009F
/// surrogates
/// U+FDD0 to U+FDEF
/// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
#[inline]
fn is_url_code_point(c: char) -> bool {
    matches!(c,
        'a'..='z' |
        'A'..='Z' |
        '0'..='9' |
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' |
        '.' | '/' | ':' | ';' | '=' | '?' | '@' | '_' | '~' |
        '\u{A0}'..='\u{D7FF}' | '\u{E000}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' |
        '\u{10000}'..='\u{1FFFD}' | '\u{20000}'..='\u{2FFFD}' |
        '\u{30000}'..='\u{3FFFD}' | '\u{40000}'..='\u{4FFFD}' |
        '\u{50000}'..='\u{5FFFD}' | '\u{60000}'..='\u{6FFFD}' |
        '\u{70000}'..='\u{7FFFD}' | '\u{80000}'..='\u{8FFFD}' |
        '\u{90000}'..='\u{9FFFD}' | '\u{A0000}'..='\u{AFFFD}' |
        '\u{B0000}'..='\u{BFFFD}' | '\u{C0000}'..='\u{CFFFD}' |
        // Plane 14 starts at U+E0000 like every other plane; only its last
        // two code points are noncharacters.
        '\u{D0000}'..='\u{DFFFD}' | '\u{E0000}'..='\u{EFFFD}' |
        '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}')
}
1572
/// Whether `ch` is a C0 control or a space, i.e. lies in U+0000..=U+0020.
///
/// https://url.spec.whatwg.org/#c0-controls-and-space
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..=' ')
}
1578
/// Whether `ch` is an ASCII tab (U+0009), line feed (U+000A) or carriage
/// return (U+000D).
///
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ['\t', '\n', '\r'].contains(&ch)
}
1584
/// Whether `ch` is an ASCII letter (`a`-`z` or `A`-`Z`).
///
/// https://url.spec.whatwg.org/#ascii-alpha
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z')
}
1590
1591#[inline]
1592pub fn to_u32(i: usize) -> ParseResult<u32> {
1593    if i <= u32::MAX as usize {
1594        Ok(i as u32)
1595    } else {
1596        Err(ParseError::Overflow)
1597    }
1598}
1599
1600fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1601    is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1602}
1603
1604/// Whether the scheme is file:, the path has a single segment, and that segment
1605/// is a Windows drive letter
1606#[inline]
1607pub fn is_windows_drive_letter(segment: &str) -> bool {
1608    segment.len() == 2 && starts_with_windows_drive_letter(segment)
1609}
1610
1611/// Whether path starts with a root slash
1612/// and a windows drive letter eg: "/c:" or "/a:/"
1613fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1614    if let Some(c) = s.as_bytes().first() {
1615        matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1616    } else {
1617        false
1618    }
1619}
1620
1621fn starts_with_windows_drive_letter(s: &str) -> bool {
1622    s.len() >= 2
1623        && ascii_alpha(s.as_bytes()[0] as char)
1624        && matches!(s.as_bytes()[1], b':' | b'|')
1625        && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#'))
1626}
1627
1628/// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
1629fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1630    let mut input = input.clone();
1631    match (input.next(), input.next(), input.next()) {
1632        // its first two code points are a Windows drive letter
1633        // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1634        (Some(a), Some(b), Some(c))
1635            if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1636        {
1637            true
1638        }
1639        // its first two code points are a Windows drive letter
1640        // its length is 2
1641        (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1642        _ => false,
1643    }
1644}