From cdd8ce9d44f3461fa51f42f15c3b1a8d5a3df19a Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Thu, 25 Jul 2024 12:41:46 +0300
Subject: [PATCH] Bugfix fn get_encoding_from_content()
---
src/utils.rs | 23 ++++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/src/utils.rs b/src/utils.rs
index 01178e8..7ea0f2a 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -39,20 +39,24 @@ pub fn get_encoding_from_headers(
/// Get encoding from the `<meta>` tag within the first 2048 bytes of HTML content.
pub fn get_encoding_from_content(raw_bytes: &[u8]) -> Option<String> {
let start_sequence = b"charset=";
- let end_sequence = b'"';
+ let start_sequence_len = start_sequence.len();
+ let end_sequence = b'>';
let max_index = min(2048, raw_bytes.len());
let start_index = raw_bytes[..max_index]
- .windows(start_sequence.len())
+ .windows(start_sequence_len)
.position(|window| window == start_sequence);
if let Some(start_index) = start_index {
- let end_index = raw_bytes[start_index..max_index]
+ let end_index = &raw_bytes[start_index..max_index]
.iter()
.position(|&byte| byte == end_sequence)?;
- let charset_slice = &raw_bytes[start_index + start_sequence.len()..start_index + end_index];
- Some(String::from_utf8_lossy(charset_slice).into_owned())
+ let charset_slice = &raw_bytes[start_index + start_sequence_len..start_index + end_index];
+ let charset = String::from_utf8_lossy(charset_slice)
+ .trim_matches('"')
+ .to_string();
+ Some(charset)
} else {
None
}
@@ -122,6 +126,15 @@ mod utils_tests {
);
}
+ #[test]
+ fn test_get_encoding_from_content_present_charset2() {
+ let raw_html = b"";
+ assert_eq!(
+ get_encoding_from_content(raw_html),
+ Some("windows1251".into())
+ );
+ }
+
#[test]
fn test_get_encoding_from_content_missing_charset() {
let raw_html = b"";