diff --git a/src/utils.rs b/src/utils.rs index 01178e8..7ea0f2a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -39,20 +39,24 @@ pub fn get_encoding_from_headers( /// Get encoding from the `` tag within the first 2048 bytes of HTML content. pub fn get_encoding_from_content(raw_bytes: &[u8]) -> Option { let start_sequence = b"charset="; - let end_sequence = b'"'; + let start_sequence_len = start_sequence.len(); + let end_sequence = b'>'; let max_index = min(2048, raw_bytes.len()); let start_index = raw_bytes[..max_index] - .windows(start_sequence.len()) + .windows(start_sequence_len) .position(|window| window == start_sequence); if let Some(start_index) = start_index { - let end_index = raw_bytes[start_index..max_index] + let end_index = &raw_bytes[start_index..max_index] .iter() .position(|&byte| byte == end_sequence)?; - let charset_slice = &raw_bytes[start_index + start_sequence.len()..start_index + end_index]; - Some(String::from_utf8_lossy(charset_slice).into_owned()) + let charset_slice = &raw_bytes[start_index + start_sequence_len..start_index + end_index]; + let charset = String::from_utf8_lossy(charset_slice) + .trim_matches('"') + .to_string(); + Some(charset) } else { None } @@ -122,6 +126,15 @@ mod utils_tests { ); } + #[test] + fn test_get_encoding_from_content_present_charset2() { + let raw_html = b""; + assert_eq!( + get_encoding_from_content(raw_html), + Some("windows1251".into()) + ); + } + #[test] fn test_get_encoding_from_content_missing_charset() { let raw_html = b"";