diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index a6b8c8b8..58b9b59d 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -217,7 +217,7 @@ impl Document { /// [`Punctuation::Quote::twin_loc`] field. This is on a best effort /// basis. /// - /// Current algorithm is very basic and could use some work. + /// Current algorithm is basic and could use some work. fn match_quotes(&mut self) { let quote_indices: Vec = self.tokens.iter_quote_indices().collect(); diff --git a/harper-core/src/language_detection.rs b/harper-core/src/language_detection.rs index fb9a1509..cd9a5bac 100644 --- a/harper-core/src/language_detection.rs +++ b/harper-core/src/language_detection.rs @@ -10,6 +10,7 @@ pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary let mut total_words = 0; let mut valid_words = 0; let mut punctuation = 0; + let mut unlintable = 0; for token in toks { match token.kind { @@ -22,10 +23,15 @@ pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary } } TokenKind::Punctuation(_) => punctuation += 1, + TokenKind::Unlintable => unlintable += 1, _ => (), } } + if unlintable > valid_words { + return false; + } + if (punctuation as f32 * 1.25) > valid_words as f32 { return false; } diff --git a/harper-core/src/parsers/isolate_english.rs b/harper-core/src/parsers/isolate_english.rs new file mode 100644 index 00000000..b6091bb1 --- /dev/null +++ b/harper-core/src/parsers/isolate_english.rs @@ -0,0 +1,35 @@ +use crate::{language_detection::is_likely_english, Dictionary}; + +use super::{Parser, Token, TokenStringExt}; + +/// A parser that wraps another, using heuristics to quickly redact paragraphs of a document that aren't +/// intended to be English text. +pub struct IsolateEnglish { + inner: Box, + dict: D, +} + +impl IsolateEnglish { + pub fn new(inner: Box, dictionary: D) -> Self { + Self { + inner, + dict: dictionary, + } + } +} + +impl Parser for IsolateEnglish { + fn parse(&mut self, source: &[char]) -> Vec { + let tokens = self.inner.parse(source); + + let mut english_tokens: Vec = Vec::with_capacity(tokens.len()); + + for sentence in tokens.iter_sentences() { + if sentence.len() > 5 && is_likely_english(sentence, source, &self.dict) { + english_tokens.extend(sentence); + } + } + + english_tokens + } +} diff --git a/harper-core/src/parsers/mod.rs b/harper-core/src/parsers/mod.rs index 3f1efad2..f35f209b 100644 --- a/harper-core/src/parsers/mod.rs +++ b/harper-core/src/parsers/mod.rs @@ -1,10 +1,12 @@ mod collapse_identifiers; +mod isolate_english; mod markdown; mod mask; mod plain_english; use blanket::blanket; pub use collapse_identifiers::CollapseIdentifiers; +pub use isolate_english::IsolateEnglish; pub use markdown::Markdown; pub use mask::Mask; pub use plain_english::PlainEnglish; diff --git a/harper-wasm/src/lib.rs b/harper-wasm/src/lib.rs index ad3f6f34..53eeffc7 100644 --- a/harper-wasm/src/lib.rs +++ b/harper-wasm/src/lib.rs @@ -4,7 +4,7 @@ use std::sync::Mutex; use harper_core::language_detection::is_doc_likely_english; use harper_core::linting::{LintGroup, LintGroupConfig, Linter}; -use harper_core::parsers::Markdown; +use harper_core::parsers::{IsolateEnglish, Markdown, PlainEnglish}; use harper_core::{remove_overlaps, Document, FullDictionary, Lrc}; use once_cell::sync::Lazy; use wasm_bindgen::prelude::wasm_bindgen; @@ -28,13 +28,26 @@ pub fn setup() { tracing_wasm::set_as_global_default(); } -/// Helper method to quickly check if a Markdown string is likely intended to be English +/// Helper method to quickly check if a plain string is likely intended to be English #[wasm_bindgen] pub fn is_likely_english(text: String) -> bool { - let document = Document::new_markdown_curated(&text); + let document = Document::new_plain_english_curated(&text); is_doc_likely_english(&document, &FullDictionary::curated()) } +/// Helper method to remove non-English text from a plain English document. +#[wasm_bindgen] +pub fn isolate_english(text: String) -> String { + let dict = FullDictionary::curated(); + + let document = Document::new_curated( + &text, + &mut IsolateEnglish::new(Box::new(PlainEnglish), dict.clone()), + ); + + document.to_string() +} + #[wasm_bindgen] pub fn get_lint_config_as_object() -> JsValue { let linter = LINTER.lock().unwrap(); diff --git a/packages/web/src/lib/analysis.ts b/packages/web/src/lib/analysis.ts index a5b9f758..677904bd 100644 --- a/packages/web/src/lib/analysis.ts +++ b/packages/web/src/lib/analysis.ts @@ -31,3 +31,9 @@ export async function isLikelyEnglish(text: string): Promise { return wasm.is_likely_english(text); } + +export async function isolateEnglish(text: string): Promise { + const wasm = await import('wasm'); + + return wasm.isolate_english(text); +} diff --git a/packages/web/src/routes/languagedetection/+page.svelte b/packages/web/src/routes/languagedetection/+page.svelte index 0bd63081..cd6af588 100644 --- a/packages/web/src/routes/languagedetection/+page.svelte +++ b/packages/web/src/routes/languagedetection/+page.svelte @@ -1,12 +1,14 @@