diff --git a/.gitignore b/.gitignore index 212de44..bff4e69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /target -.DS_Store \ No newline at end of file +.DS_Store +example.html \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index b7cb0a3..0c5c498 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 3 [[package]] name = "auto_encoder" -version = "0.1.5" +version = "0.1.6" dependencies = [ "chardetng", "encoding_rs", @@ -157,9 +157,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.88" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] @@ -196,9 +196,9 @@ checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" [[package]] name = "syn" -version = "2.0.82" +version = "2.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021" +checksum = "01680f5d178a369f817f43f3d399650272873a8e7588a7872f7e90edc71d60a3" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 0379bcc..f9fea61 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "auto_encoder" -version = "0.1.5" +version = "0.1.6" edition = "2021" description = "Auto encoding library" repository = "https://github.com/spider-rs/auto-encoder" diff --git a/src/lib.rs b/src/lib.rs index dc2f559..0d9f20d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -144,7 +144,7 @@ static ENCODINGS_BY_LOCALE: phf::Map<&'static str, &'static encoding_rs::Encodin "en-gb" => encoding_rs::WINDOWS_1252, // English (United Kingdom) "en-ie" => encoding_rs::WINDOWS_1252, // English (Ireland) "en-nz" => encoding_rs::WINDOWS_1252, // English (New Zealand) - "en-us" => encoding_rs::WINDOWS_1252, // English (United States) + "en-us" => encoding_rs::UTF_8, // English (United States) "es-ar" => encoding_rs::WINDOWS_1252, // Spanish (Argentina) "es-bo" => encoding_rs::WINDOWS_1252, // Spanish (Bolivia) "es-cl" => encoding_rs::WINDOWS_1252, // Spanish (Chile) @@ -206,6 +206,14 @@ static ENCODINGS_BY_LOCALE: phf::Map<&'static str, &'static encoding_rs::Encodin "zh-tw" => encoding_rs::BIG5, // Chinese (Taiwan) }; +/// Handle the html encoding found. +pub struct HtmlMetadata { + /// The HTML lang attribute. + pub lang: Option, + /// The html meta encoding. + pub encoding: Option, +} + /// Get encoding for the locale if found pub fn encoding_for_locale(locale: &str) -> Option<&'static encoding_rs::Encoding> { ENCODINGS_BY_LOCALE @@ -233,30 +241,188 @@ pub fn is_binary_file(content: &[u8]) -> bool { /// Detect the language of a HTML resource. This does nothing without the "encoding" flag enabled. pub fn detect_language(html_content: &[u8]) -> Option { + if !html_content.is_empty() { + let search_area_limit = html_content.len().min(1024); + let search_area = &html_content[..search_area_limit]; + if let Some(html_start) = find_subsequence(search_area, b"')?; + return Some(String::from_utf8(after_lang[..end].to_vec()).ok()?); + } + } + } + } + None +} + +/// Detect the encoding used in an HTML file. +pub fn detect_encoding(html_content: &[u8]) -> Option { + // Limit the search area for efficiency let search_area_limit = html_content.len().min(1024); let search_area = &html_content[..search_area_limit]; - if let Some(html_start) = find_subsequence(search_area, b" + if let Some(charset_start) = find_subsequence(meta_content, b"charset=") { + let after_charset = &meta_content[charset_start + 8..]; + if let Some((quote, remaining)) = after_charset.split_first() { + if *quote == b'"' || *quote == b'\'' { + if let Some(quote_close) = find_subsequence(&remaining, &[*quote]) { + let charset_bytes = &remaining[..quote_close]; + if let Ok(charset) = String::from_utf8(charset_bytes.to_vec()) { + return Some(charset); + } + } + } + } + } - if let Some(lang_start) = find_subsequence(rest, b"lang=") { - let after_lang = &rest[lang_start + 5..]; - let quote = *after_lang.get(0)?; + // Case 2: + if let Some(http_equiv_start) = + find_subsequence(meta_content, b"http-equiv=\"Content-Type\"") + { + let content_start_idx = http_equiv_start + b"http-equiv=\"Content-Type\"".len(); + if let Some(content_start) = + find_subsequence(&meta_content[content_start_idx..], b"content=") + { + let after_content = &meta_content[content_start_idx + content_start + 8..]; + if let Some((quote, remaining)) = after_content.split_first() { + if *quote == b'"' || *quote == b'\'' { + let content_end = find_subsequence(&remaining, &[*quote])?; + let full_content = &remaining[..content_end]; + if let Some(charset_pos) = find_subsequence(full_content, b"charset=") { + let after_charset = &full_content[charset_pos + 8..]; + let charset_end = after_charset + .iter() + .position(|&c| c == b';' || c.is_ascii_whitespace()) + .unwrap_or(after_charset.len()); + if let Ok(charset) = + String::from_utf8(after_charset[..charset_end].to_vec()) + { + return Some(charset); + } + } + } + } + } + } + } else { + break; + } + } + + None +} - if quote == b'"' || quote == b'\'' { - if let Some(quote_close) = find_subsequence(&after_lang[1..], &[quote]) { - return Some(String::from_utf8(after_lang[1..quote_close + 1].to_vec()).ok()?); +/// Detect the html metadata to process the element based on the encoding or language found. +pub fn detect_html_metadata(html_content: &[u8]) -> Option { + let mut lang: Option = None; + let mut encoding: Option = None; + + if !html_content.is_empty() { + let search_area_limit = html_content.len().min(1024); + let search_area = &html_content[..search_area_limit]; + + // Detect language + if let Some(html_start) = find_subsequence(search_area, b"') + .unwrap_or(after_lang.len()); + lang = Some(String::from_utf8(after_lang[..end].to_vec()).ok()?); + } + } + } + + // Detect encoding + let mut pos = 0; + while pos < search_area.len() { + if let Some(meta_start) = find_subsequence(&search_area[pos..], b"Test"; assert!(detect_language(html_content).is_none()); } + + #[ignore] + #[test] + fn test_detect_encoding() { + use maud::{html, DOCTYPE}; + let markup = html! { + (DOCTYPE) + meta charset="utf-8"; + } + .into_string(); + assert!( + detect_encoding(&markup.as_bytes()) + .unwrap_or_default() + .to_lowercase() + == "utf-8" + ); + } }