Skip to content

Commit

Permalink
chore(encoding): add dual lang and encoding handling
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 23, 2024
1 parent fe10935 commit 5c1f6f2
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 22 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/target
.DS_Store
.DS_Store
example.html
10 changes: 5 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "auto_encoder"
version = "0.1.5"
version = "0.1.6"
edition = "2021"
description = "Auto encoding library"
repository = "https://github.com/spider-rs/auto-encoder"
Expand Down
213 changes: 198 additions & 15 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ static ENCODINGS_BY_LOCALE: phf::Map<&'static str, &'static encoding_rs::Encodin
"en-gb" => encoding_rs::WINDOWS_1252, // English (United Kingdom)
"en-ie" => encoding_rs::WINDOWS_1252, // English (Ireland)
"en-nz" => encoding_rs::WINDOWS_1252, // English (New Zealand)
"en-us" => encoding_rs::WINDOWS_1252, // English (United States)
"en-us" => encoding_rs::UTF_8, // English (United States)
"es-ar" => encoding_rs::WINDOWS_1252, // Spanish (Argentina)
"es-bo" => encoding_rs::WINDOWS_1252, // Spanish (Bolivia)
"es-cl" => encoding_rs::WINDOWS_1252, // Spanish (Chile)
Expand Down Expand Up @@ -206,6 +206,14 @@ static ENCODINGS_BY_LOCALE: phf::Map<&'static str, &'static encoding_rs::Encodin
"zh-tw" => encoding_rs::BIG5, // Chinese (Taiwan)
};

/// Metadata extracted from the head of an HTML document.
///
/// Both fields are `None` when the corresponding declaration was not found.
/// The values are stored as they appear in the document (no case folding or
/// validation against known language tags / encoding labels is done here).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HtmlMetadata {
    /// The HTML lang attribute.
    pub lang: Option<String>,
    /// The html meta encoding.
    pub encoding: Option<String>,
}

/// Get encoding for the locale if found
pub fn encoding_for_locale(locale: &str) -> Option<&'static encoding_rs::Encoding> {
ENCODINGS_BY_LOCALE
Expand Down Expand Up @@ -233,30 +241,188 @@ pub fn is_binary_file(content: &[u8]) -> bool {

/// Detect the language of an HTML resource from the `lang` attribute of its `<html>` tag.
///
/// Only the first 1024 bytes of `html_content` are inspected, and the attribute is looked
/// for inside the `<html ...>` opening tag only, so `lang=`-like text on later elements
/// (for example `hreflang="es"` on a `<link>`) is not mistaken for the document language.
///
/// The value may be double-quoted, single-quoted, or unquoted (terminated by ASCII
/// whitespace or `>`). Returns `None` when there is no `<html` tag, no `lang=` attribute,
/// the value is unterminated within the scanned prefix, or the value is not valid UTF-8.
///
/// NOTE(review): earlier documentation stated this "does nothing without the "encoding"
/// flag enabled", but no feature gating is visible on this function — confirm.
pub fn detect_language(html_content: &[u8]) -> Option<String> {
    if html_content.is_empty() {
        return None;
    }

    // Limit the search area for efficiency.
    let search_area = &html_content[..html_content.len().min(1024)];
    let html_start = find_subsequence(search_area, b"<html")?;
    let rest = &search_area[html_start..];

    // Confine the attribute search to the opening tag. The closing `>` is kept in the
    // slice so an unquoted value can still terminate on it; when the tag is cut off by
    // the 1024-byte limit the whole remainder is treated as the tag.
    let tag = match rest.iter().position(|&b| b == b'>') {
        Some(end) => &rest[..=end],
        None => rest,
    };

    let lang_start = find_subsequence(tag, b"lang=")?;
    let after_lang = &tag[lang_start + 5..];
    let first = *after_lang.first()?;

    let value = if first == b'"' || first == b'\'' {
        // Quoted value: everything up to the matching closing quote.
        let quoted = &after_lang[1..];
        let close = quoted.iter().position(|&b| b == first)?;
        &quoted[..close]
    } else {
        // Unquoted value: ends at the first whitespace or at the end of the tag.
        let end = after_lang
            .iter()
            .position(|&b| b.is_ascii_whitespace() || b == b'>')?;
        &after_lang[..end]
    };

    std::str::from_utf8(value).ok().map(str::to_owned)
}

/// Detect the encoding used in an HTML file.
pub fn detect_encoding(html_content: &[u8]) -> Option<String> {
// Limit the search area for efficiency
let search_area_limit = html_content.len().min(1024);
let search_area = &html_content[..search_area_limit];

if let Some(html_start) = find_subsequence(search_area, b"<html") {
let rest = &search_area[html_start..];
let mut pos = 0;

while pos < search_area.len() {
if let Some(meta_start) = find_subsequence(&search_area[pos..], b"<meta") {
pos += meta_start;
let meta_content = &search_area[pos..];
pos += meta_content.len();

// Case 1: <meta charset="...">
if let Some(charset_start) = find_subsequence(meta_content, b"charset=") {
let after_charset = &meta_content[charset_start + 8..];
if let Some((quote, remaining)) = after_charset.split_first() {
if *quote == b'"' || *quote == b'\'' {
if let Some(quote_close) = find_subsequence(&remaining, &[*quote]) {
let charset_bytes = &remaining[..quote_close];
if let Ok(charset) = String::from_utf8(charset_bytes.to_vec()) {
return Some(charset);
}
}
}
}
}

if let Some(lang_start) = find_subsequence(rest, b"lang=") {
let after_lang = &rest[lang_start + 5..];
let quote = *after_lang.get(0)?;
// Case 2: <meta http-equiv="Content-Type" content="...; charset=...">
if let Some(http_equiv_start) =
find_subsequence(meta_content, b"http-equiv=\"Content-Type\"")
{
let content_start_idx = http_equiv_start + b"http-equiv=\"Content-Type\"".len();
if let Some(content_start) =
find_subsequence(&meta_content[content_start_idx..], b"content=")
{
let after_content = &meta_content[content_start_idx + content_start + 8..];
if let Some((quote, remaining)) = after_content.split_first() {
if *quote == b'"' || *quote == b'\'' {
let content_end = find_subsequence(&remaining, &[*quote])?;
let full_content = &remaining[..content_end];
if let Some(charset_pos) = find_subsequence(full_content, b"charset=") {
let after_charset = &full_content[charset_pos + 8..];
let charset_end = after_charset
.iter()
.position(|&c| c == b';' || c.is_ascii_whitespace())
.unwrap_or(after_charset.len());
if let Ok(charset) =
String::from_utf8(after_charset[..charset_end].to_vec())
{
return Some(charset);
}
}
}
}
}
}
} else {
break;
}
}

None
}

if quote == b'"' || quote == b'\'' {
if let Some(quote_close) = find_subsequence(&after_lang[1..], &[quote]) {
return Some(String::from_utf8(after_lang[1..quote_close + 1].to_vec()).ok()?);
/// Detect the html metadata to process the element based on the encoding or language found.
pub fn detect_html_metadata(html_content: &[u8]) -> Option<HtmlMetadata> {
let mut lang: Option<String> = None;
let mut encoding: Option<String> = None;

if !html_content.is_empty() {
let search_area_limit = html_content.len().min(1024);
let search_area = &html_content[..search_area_limit];

// Detect language
if let Some(html_start) = find_subsequence(search_area, b"<html") {
let rest = &search_area[html_start..];
if let Some(lang_start) = find_subsequence(rest, b"lang=") {
let after_lang = &rest[lang_start + 5..];
let quote = *after_lang.get(0).unwrap_or(&b' ');

if quote == b'"' || quote == b'\'' {
if let Some(quote_close) = find_subsequence(&after_lang[1..], &[quote]) {
lang =
Some(String::from_utf8(after_lang[1..quote_close + 1].to_vec()).ok()?);
}
} else {
let end = after_lang
.iter()
.position(|&c| c.is_ascii_whitespace() || c == b'>')
.unwrap_or(after_lang.len());
lang = Some(String::from_utf8(after_lang[..end].to_vec()).ok()?);
}
}
}

// Detect encoding
let mut pos = 0;
while pos < search_area.len() {
if let Some(meta_start) = find_subsequence(&search_area[pos..], b"<meta") {
pos += meta_start;
let meta_content = &search_area[pos..];
pos += meta_content.len();

if let Some(charset_start) = find_subsequence(meta_content, b"charset=") {
let after_charset = &meta_content[charset_start + 8..];
if let Some((quote, remaining)) = after_charset.split_first() {
if *quote == b'"' || *quote == b'\'' {
if let Some(quote_close) = find_subsequence(&remaining, &[*quote]) {
let charset_bytes = &remaining[..quote_close];
encoding = String::from_utf8(charset_bytes.to_vec()).ok();
break;
}
}
}
}

if let Some(http_equiv_start) =
find_subsequence(meta_content, b"http-equiv=\"Content-Type\"")
{
let content_start_idx = http_equiv_start + b"http-equiv=\"Content-Type\"".len();
if let Some(content_start) =
find_subsequence(&meta_content[content_start_idx..], b"content=")
{
let after_content = &meta_content[content_start_idx + content_start + 8..];
if let Some((quote, remaining)) = after_content.split_first() {
if *quote == b'"' || *quote == b'\'' {
let content_end = find_subsequence(&remaining, &[*quote])?;
let full_content = &remaining[..content_end];
if let Some(charset_pos) =
find_subsequence(full_content, b"charset=")
{
let after_charset = &full_content[charset_pos + 8..];
let charset_end = after_charset
.iter()
.position(|&c| c == b';' || c.is_ascii_whitespace())
.unwrap_or(after_charset.len());
encoding =
String::from_utf8(after_charset[..charset_end].to_vec())
.ok();
break;
}
}
}
}
}
} else {
let end = after_lang
.iter()
.position(|&c| c.is_ascii_whitespace() || c == b'>')?;
return Some(String::from_utf8(after_lang[..end].to_vec()).ok()?);
break;
}
}
}

None
Some(HtmlMetadata { lang, encoding })
}

/// Helper function to find a subsequence in a slice.
Expand Down Expand Up @@ -439,7 +605,7 @@ mod tests {
fn test_encoding_for_locale() {
assert_eq!(
encoding_for_locale("en-us"),
Some(encoding_rs::WINDOWS_1252)
Some(encoding_rs::UTF_8)
);
assert_eq!(encoding_for_locale("zh-cn"), Some(encoding_rs::GB18030));
assert_eq!(encoding_for_locale("ja-jp"), Some(encoding_rs::SHIFT_JIS));
Expand Down Expand Up @@ -514,4 +680,21 @@ mod tests {
let html_content = b"<html><head><title>Test</title></head><body></body></html>";
assert!(detect_language(html_content).is_none());
}

// NOTE(review): this test is excluded from the default run via `#[ignore]`; the reason
// is not recorded here — confirm whether it can be re-enabled.
#[ignore]
#[test]
fn test_detect_encoding() {
    use maud::{html, DOCTYPE};
    // Render a minimal document whose only declaration is `<meta charset="utf-8">`.
    let markup = html! {
        (DOCTYPE)
        meta charset="utf-8";
    }
    .into_string();
    let detected = detect_encoding(markup.as_bytes()).unwrap_or_default();
    // `assert_eq!` prints both values on failure, unlike `assert!(a == b)`.
    assert_eq!(detected.to_lowercase(), "utf-8");
}
}

0 comments on commit 5c1f6f2

Please sign in to comment.