From 3562f2c9aacb4b7cba8a61d95f4d325b6b5887c1 Mon Sep 17 00:00:00 2001
From: Sander van Harmelen
Date: Mon, 28 Jan 2019 19:28:51 +0100
Subject: [PATCH] Trim unicode characters from the start of URLs

---
 CHANGELOG.md |  1 +
 src/url.rs   | 15 +++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5a1d8156..2bdd8c68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Resolved off-by-one issue with erasing characters in the last column
 - Excessive polling every 100ms with `live_config_reload` enabled
+- Unicode characters at the beginning of URLs are now properly ignored
 
 ## Version 0.2.7
 
diff --git a/src/url.rs b/src/url.rs
index fc2a00f9..0db083be 100644
--- a/src/url.rs
+++ b/src/url.rs
@@ -45,13 +45,19 @@ impl UrlParser {
 
     /// Returns the URL if the parser has found any.
     pub fn url(mut self) -> Option {
-        // Remove non-alphabetical characters before scheme
+        // Remove non-alphabetical characters before the scheme
+        // https://tools.ietf.org/html/rfc3986#section-3.1
         if let Some(index) = self.state.find("://") {
-            for i in (0..index - 1).rev() {
-                match self.state.chars().nth(i).unwrap() {
+            let iter = self
+                .state
+                .char_indices()
+                .rev()
+                .skip_while(|(byte_index, _)| *byte_index >= index);
+            for (byte_index, c) in iter {
+                match c {
                     'a'...'z' | 'A'...'Z' => (),
                     _ => {
-                        self.state = self.state.split_off(i + 1);
+                        self.state = self.state.split_off(byte_index + c.len_utf8());
                         break;
                     }
                 }
@@ -206,6 +212,7 @@ mod tests {
         url_test("complicated:https://example.org", "https://example.org", 15);
         url_test("test.https://example.org", "https://example.org", 10);
         url_test(",https://example.org", "https://example.org", 5);
+        url_test("\u{2502}https://example.org", "https://example.org", 5);
     }
 
     #[test]