Skip to content

Commit

Permalink
Bugfix fn get_encoding_from_content()
Browse files Browse the repository at this point in the history
  • Loading branch information
deedy5 committed Jul 25, 2024
1 parent 0b80d7f commit cdd8ce9
Showing 1 changed file with 18 additions and 5 deletions.
23 changes: 18 additions & 5 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,24 @@ pub fn get_encoding_from_headers(
/// Get encoding from the `<meta charset="...">` tag within the first 2048 bytes of HTML content.
pub fn get_encoding_from_content(raw_bytes: &[u8]) -> Option<String> {
let start_sequence = b"charset=";
let end_sequence = b'"';
let start_sequence_len = start_sequence.len();
let end_sequence = b'>';
let max_index = min(2048, raw_bytes.len());

let start_index = raw_bytes[..max_index]
.windows(start_sequence.len())
.windows(start_sequence_len)
.position(|window| window == start_sequence);

if let Some(start_index) = start_index {
let end_index = raw_bytes[start_index..max_index]
let end_index = &raw_bytes[start_index..max_index]
.iter()
.position(|&byte| byte == end_sequence)?;

let charset_slice = &raw_bytes[start_index + start_sequence.len()..start_index + end_index];
Some(String::from_utf8_lossy(charset_slice).into_owned())
let charset_slice = &raw_bytes[start_index + start_sequence_len..start_index + end_index];
let charset = String::from_utf8_lossy(charset_slice)
.trim_matches('"')
.to_string();
Some(charset)
} else {
None
}
Expand Down Expand Up @@ -122,6 +126,15 @@ mod utils_tests {
);
}

/// A double-quoted `charset` value inside a `<meta>` tag must come back
/// without the surrounding quotes.
#[test]
fn test_get_encoding_from_content_present_charset2() {
    let html = b"<html><head><meta charset=\"windows1251\"></head></html>";
    let expected: Option<String> = Some("windows1251".into());

    let detected = get_encoding_from_content(html);

    assert_eq!(detected, expected);
}

#[test]
fn test_get_encoding_from_content_missing_charset() {
let raw_html = b"<html><head></head></html>";
Expand Down

0 comments on commit cdd8ce9

Please sign in to comment.