From 748d9bb6a8f10e1c2e21c3e22a184b91a2a444e8 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Tue, 12 Nov 2024 13:58:12 -0700 Subject: [PATCH 1/7] feat(#253): remove requirement that word be in list to lint for repetition --- harper-core/src/linting/repeated_words.rs | 111 ++++++++++------------ 1 file changed, 49 insertions(+), 62 deletions(-) diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index 4271442a..d377c0b2 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -1,66 +1,53 @@ -use super::{Lint, LintKind, PatternLinter, Suggestion}; -use crate::patterns::{Pattern, SequencePattern, WordPatternGroup}; -use crate::token::{Token, TokenStringExt}; - -pub struct RepeatedWords { - pattern: Box, -} - -impl RepeatedWords { - pub fn new() -> Self { - Self::default() - } -} - -impl Default for RepeatedWords { - fn default() -> Self { - let words = [ - "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not", - "on", "with", "he", "as", "you", "do", "at", "this", "is", "but", "his", "by", "from", - "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", - "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which", - "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take", - "people", "into", "year", "your", "good", "some", "could", "them", "see", "other", - "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back", - "after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new", - "want", "because", "any", "these", "give", "day", "most", "us", "are", - ]; - - let mut pattern = WordPatternGroup::default(); - - for word in words { - pattern.add( - word, - Box::new( - SequencePattern::default() - .then_exact_word(word) - .then_whitespace() - .then_exact_word(word), - ), - ); +use super::{Lint, LintKind, Linter, Suggestion}; +use crate::token::{Token, TokenKind, TokenStringExt}; +use crate::{Document, Span}; + +#[derive(Debug, Clone, Default)] +pub struct RepeatedWords; + +impl Linter for RepeatedWords { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + for chunk in document.iter_chunks() { + let mut iter = chunk.iter_word_indices().zip(chunk.iter_words()).peekable(); + + while let (Some((idx_a, tok_a)), Some((idx_b, tok_b))) = (iter.next(), iter.peek()) { + let word_a = document.get_span_content(tok_a.span); + let word_b = document.get_span_content(tok_b.span); + + if word_a == word_b { + let intervening_tokens = &chunk[idx_a + 1..*idx_b]; + + if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) { + continue; + } + + // Detect and remove the whitespace between the repetitions. + let remove_end = tok_b.span.end; + + let remove_start = if let Some(Token { + span, + kind: TokenKind::Space(_), + }) = intervening_tokens.last() + { + span.start + } else { + tok_b.span.start + }; + + lints.push(Lint { + span: Span::new(remove_start, remove_end), + lint_kind: LintKind::Repetition, + suggestions: vec![Suggestion::Remove], + message: "Did you mean to repeat this word?".to_string(), + ..Default::default() + }) + } + } } - Self { - pattern: Box::new(pattern), - } - } -} - -impl PatternLinter for RepeatedWords { - fn pattern(&self) -> &dyn Pattern { - self.pattern.as_ref() - } - - fn match_to_lint(&self, matched_tokens: &[Token], source: &[char]) -> Lint { - Lint { - span: matched_tokens.span().unwrap(), - lint_kind: LintKind::Repetition, - suggestions: vec![Suggestion::ReplaceWith( - matched_tokens[0].span.get_content(source).to_vec(), - )], - message: "Did you mean to repeat this word?".to_string(), - ..Default::default() - } + lints } } @@ -71,6 +58,6 @@ mod tests { #[test] fn catches_basic() { - assert_lint_count("I wanted the the banana.", RepeatedWords::new(), 1) + assert_lint_count("I wanted the the banana.", RepeatedWords::default(), 1) } } From 2ed85b71041fb813902d33f538882af0a461aaf8 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Tue, 12 Nov 2024 15:13:56 -0700 Subject: [PATCH 2/7] fix(#253): lint span both words, not just the second --- harper-core/src/linting/repeated_words.rs | 25 +++++++---------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index d377c0b2..60c7c041 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -1,6 +1,6 @@ use super::{Lint, LintKind, Linter, Suggestion}; -use crate::token::{Token, TokenKind, TokenStringExt}; -use crate::{Document, Span}; +use crate::token::TokenStringExt; +use crate::{CharStringExt, Document, Span}; #[derive(Debug, Clone, Default)] pub struct RepeatedWords; @@ -16,30 +16,19 @@ impl Linter for RepeatedWords { let word_a = document.get_span_content(tok_a.span); let word_b = document.get_span_content(tok_b.span); - if word_a == word_b { + if word_a.to_lower() == word_b.to_lower() { let intervening_tokens = &chunk[idx_a + 1..*idx_b]; if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) { continue; } - // Detect and remove the whitespace between the repetitions. - let remove_end = tok_b.span.end; - - let remove_start = if let Some(Token { - span, - kind: TokenKind::Space(_), - }) = intervening_tokens.last() - { - span.start - } else { - tok_b.span.start - }; - lints.push(Lint { - span: Span::new(remove_start, remove_end), + span: Span::new(tok_a.span.start, tok_b.span.end), lint_kind: LintKind::Repetition, - suggestions: vec![Suggestion::Remove], + suggestions: vec![Suggestion::ReplaceWith( + document.get_span_content(tok_a.span).to_vec(), + )], message: "Did you mean to repeat this word?".to_string(), ..Default::default() }) From 890c1ff0cd7f3f32e4a67dcc1c3090aba29000d0 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Tue, 12 Nov 2024 16:39:11 -0700 Subject: [PATCH 3/7] fix(#253): exclude homophones from repetition lint --- harper-core/src/document.rs | 1 + harper-core/src/linting/repeated_words.rs | 8 +++++- harper-core/src/token.rs | 10 +++++++ harper-core/src/word_metadata.rs | 35 +++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 9ec2b328..c20b765c 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -502,6 +502,7 @@ impl TokenStringExt for Document { create_fns_on_doc!(sentence_terminator); create_fns_on_doc!(chunk_terminator); create_fns_on_doc!(punctuation); + create_fns_on_doc!(homophone); fn first_sentence_word(&self) -> Option { self.tokens.first_sentence_word() diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index 60c7c041..f2b1e2f1 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -16,7 +16,7 @@ impl Linter for RepeatedWords { let word_a = document.get_span_content(tok_a.span); let word_b = document.get_span_content(tok_b.span); - if word_a.to_lower() == word_b.to_lower() { + if !tok_a.kind.is_homophone() && word_a.to_lower() == word_b.to_lower() { let intervening_tokens = &chunk[idx_a + 1..*idx_b]; if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) { @@ -49,4 +49,10 @@ mod tests { fn catches_basic() { assert_lint_count("I wanted the the banana.", RepeatedWords::default(), 1) } + + #[test] + fn does_not_lint_homophones() { + assert_lint_count("To address address problems.", RepeatedWords::default(), 0); + assert_lint_count("To record record profits.", RepeatedWords::default(), 0); + } } diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs index c4316ba2..9df4b170 100644 --- a/harper-core/src/token.rs +++ b/harper-core/src/token.rs @@ -323,6 +323,14 @@ impl TokenKind { metadata.is_noun() } + pub fn is_homophone(&self) -> bool { + let TokenKind::Word(metadata) = self else { + return false; + }; + + metadata.is_homophone() + } + pub fn is_comma(&self) -> bool { matches!(self, TokenKind::Punctuation(Punctuation::Comma)) } @@ -397,6 +405,7 @@ pub trait TokenStringExt { create_decl_for!(sentence_terminator); create_decl_for!(chunk_terminator); create_decl_for!(punctuation); + create_decl_for!(homophone); fn iter_linking_verb_indices(&self) -> impl Iterator + '_; fn iter_linking_verbs(&self) -> impl Iterator + '_; @@ -429,6 +438,7 @@ impl TokenStringExt for [Token] { create_fns_for!(unlintable); create_fns_for!(sentence_terminator); create_fns_for!(chunk_terminator); + create_fns_for!(homophone); fn first_non_whitespace(&self) -> Option { self.iter().find(|t| !t.kind.is_whitespace()).copied() diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index ad34cafd..b57abcb4 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -113,6 +113,41 @@ impl WordMetadata { ) } + /// Checks if the word is a homophone + /// This is based off of metadata, so it is not guaranteed to be correct + pub fn is_homophone(&self) -> bool { + [ + matches!( + self.noun, + Some(NounData { + is_proper: None, + is_plural: None, + is_possessive: None, + is_pronoun: None + }) + ), + matches!( + self.verb, + Some(VerbData { + is_linking: None, + tense: None + }) + ), + self.is_conjunction(), + self.is_adjective(), + self.is_adverb(), + self.is_possessive_noun(), + self.is_plural_noun(), + self.is_proper_noun(), + self.is_pronoun(), + self.is_linking_verb(), + ] + .iter() + .map(|b| *b as u8) + .sum::() + > 1 + } + /// Checks whether a word is _definitely_ a swear. pub fn is_swear(&self) -> bool { matches!(self.swear, Some(true)) From 4a51cc3884f863bffba34f5966e3696b7898dd13 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Sun, 17 Nov 2024 10:37:03 -0700 Subject: [PATCH 4/7] fix(#253): correct homophone to homograph and separate test --- harper-core/src/document.rs | 2 +- harper-core/src/linting/repeated_words.rs | 8 ++++++-- harper-core/src/token.rs | 8 ++++---- harper-core/src/word_metadata.rs | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index c20b765c..f7c121f7 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -502,7 +502,7 @@ impl TokenStringExt for Document { create_fns_on_doc!(sentence_terminator); create_fns_on_doc!(chunk_terminator); create_fns_on_doc!(punctuation); - create_fns_on_doc!(homophone); + create_fns_on_doc!(likely_homograph); fn first_sentence_word(&self) -> Option { self.tokens.first_sentence_word() diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index f2b1e2f1..bacb7269 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -16,7 +16,7 @@ impl Linter for RepeatedWords { let word_a = document.get_span_content(tok_a.span); let word_b = document.get_span_content(tok_b.span); - if !tok_a.kind.is_homophone() && word_a.to_lower() == word_b.to_lower() { + if !tok_a.kind.is_likely_homograph() && word_a.to_lower() == word_b.to_lower() { let intervening_tokens = &chunk[idx_a + 1..*idx_b]; if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) { @@ -51,8 +51,12 @@ mod tests { } #[test] - fn does_not_lint_homophones() { + fn does_not_lint_homographs_address() { assert_lint_count("To address address problems.", RepeatedWords::default(), 0); + } + + #[test] + fn does_not_lint_homographs_record() { assert_lint_count("To record record profits.", RepeatedWords::default(), 0); } } diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs index 9df4b170..3da5722d 100644 --- a/harper-core/src/token.rs +++ b/harper-core/src/token.rs @@ -323,12 +323,12 @@ impl TokenKind { metadata.is_noun() } - pub fn is_homophone(&self) -> bool { + pub fn is_likely_homograph(&self) -> bool { let TokenKind::Word(metadata) = self else { return false; }; - metadata.is_homophone() + metadata.is_likely_homograph() } pub fn is_comma(&self) -> bool { @@ -405,7 +405,7 @@ pub trait TokenStringExt { create_decl_for!(sentence_terminator); create_decl_for!(chunk_terminator); create_decl_for!(punctuation); - create_decl_for!(homophone); + create_decl_for!(likely_homograph); fn iter_linking_verb_indices(&self) -> impl Iterator + '_; fn iter_linking_verbs(&self) -> impl Iterator + '_; @@ -438,7 +438,7 @@ impl TokenStringExt for [Token] { create_fns_for!(unlintable); create_fns_for!(sentence_terminator); create_fns_for!(chunk_terminator); - create_fns_for!(homophone); + create_fns_for!(likely_homograph); fn first_non_whitespace(&self) -> Option { self.iter().find(|t| !t.kind.is_whitespace()).copied() diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index b57abcb4..e1e0060f 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -113,9 +113,9 @@ impl WordMetadata { ) } - /// Checks if the word is a homophone + /// Checks if the word is a homograph /// This is based off of metadata, so it is not guaranteed to be correct - pub fn is_homophone(&self) -> bool { + pub fn is_likely_homograph(&self) -> bool { [ matches!( self.noun, From cd92a7a70ce7ac95bb4456a62dee44040bdadc07 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Sat, 23 Nov 2024 12:55:41 -0700 Subject: [PATCH 5/7] feat: generate `WordMetadata` queries with a macro --- harper-core/src/word_metadata.rs | 161 +++++++++++-------------------- 1 file changed, 57 insertions(+), 104 deletions(-) diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index e1e0060f..314f855d 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -1,4 +1,5 @@ use is_macro::Is; +use paste::paste; use serde::{Deserialize, Serialize}; #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)] @@ -14,10 +15,59 @@ pub struct WordMetadata { pub common: bool, } +/// Needed for `serde` fn default_common() -> bool { false } +macro_rules! generate_metadata_queries { + ($($category:ident has $($sub:ident),*).*) => { + paste! { + pub fn is_likely_homograph(&self) -> bool { + if [$($(self.[< is_ $sub _ $category >](),)*)*].iter().map(|b| *b as u8).sum::() > 1 { + return true; + } + + [$( + self.[< is_ $category >](), + )*].iter().map(|b| *b as u8).sum::() > 1 + } + + $( + #[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")] + pub fn [< is_ $category >](&self) -> bool { + self.$category.is_some() + } + + $( + #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")] + pub fn [< is_ $sub _ $category >](&self) -> bool { + matches!( + self.$category, + Some([< $category:camel Data >]{ + [< is_ $sub >]: Some(true), + .. + }) + ) + } + + + #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")] + pub fn [< is_not_ $sub _ $category >](&self) -> bool { + matches!( + self.$category, + Some([< $category:camel Data >]{ + [< is_ $sub >]: Some(false), + .. + }) + ) + } + )* + )* + } + }; +} + impl WordMetadata { /// Produce a copy of `self` with the known properties of `other` set. pub fn or(&self, other: &Self) -> Self { @@ -43,110 +93,13 @@ impl WordMetadata { } } - pub fn is_noun(&self) -> bool { - self.noun.is_some() - } - - pub fn is_conjunction(&self) -> bool { - self.conjunction.is_some() - } - - pub fn is_verb(&self) -> bool { - self.verb.is_some() - } - - pub fn is_adjective(&self) -> bool { - self.adjective.is_some() - } - - pub fn is_adverb(&self) -> bool { - self.adverb.is_some() - } - - pub fn is_possessive_noun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_possessive: Some(true), - .. - }) - ) - } - - pub fn is_plural_noun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_plural: Some(true), - .. - }) - ) - } - - pub fn is_proper_noun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_proper: Some(true), - .. - }) - ) - } - - pub fn is_pronoun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_pronoun: Some(true), - .. - }) - ) - } - - pub fn is_linking_verb(&self) -> bool { - matches!( - self.verb, - Some(VerbData { - is_linking: Some(true), - .. - }) - ) - } - - /// Checks if the word is a homograph - /// This is based off of metadata, so it is not guaranteed to be correct - pub fn is_likely_homograph(&self) -> bool { - [ - matches!( - self.noun, - Some(NounData { - is_proper: None, - is_plural: None, - is_possessive: None, - is_pronoun: None - }) - ), - matches!( - self.verb, - Some(VerbData { - is_linking: None, - tense: None - }) - ), - self.is_conjunction(), - self.is_adjective(), - self.is_adverb(), - self.is_possessive_noun(), - self.is_plural_noun(), - self.is_proper_noun(), - self.is_pronoun(), - self.is_linking_verb(), - ] - .iter() - .map(|b| *b as u8) - .sum::() - > 1 - } + generate_metadata_queries!( + noun has proper, plural, possessive, pronoun. + verb has linking. + conjunction has. + adjective has. + adverb has + ); /// Checks whether a word is _definitely_ a swear. pub fn is_swear(&self) -> bool { From ef6babf31b4fb61fee835035159741c32d3b6c6e Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Sat, 23 Nov 2024 13:11:27 -0700 Subject: [PATCH 6/7] style: remove redundant calls to `::default()` --- harper-core/src/linting/repeated_words.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index bacb7269..be9d0bb2 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -47,16 +47,16 @@ mod tests { #[test] fn catches_basic() { - assert_lint_count("I wanted the the banana.", RepeatedWords::default(), 1) + assert_lint_count("I wanted the the banana.", RepeatedWords, 1) } #[test] fn does_not_lint_homographs_address() { - assert_lint_count("To address address problems.", RepeatedWords::default(), 0); + assert_lint_count("To address address problems.", RepeatedWords, 0); } #[test] fn does_not_lint_homographs_record() { - assert_lint_count("To record record profits.", RepeatedWords::default(), 0); + assert_lint_count("To record record profits.", RepeatedWords, 0); } } From f8064aef4e22048d1ec0ad25eb4e4d3514d9465f Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Sat, 23 Nov 2024 13:26:32 -0700 Subject: [PATCH 7/7] test: fix test broken by merge --- harper-core/src/spell/full_dictionary.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/harper-core/src/spell/full_dictionary.rs b/harper-core/src/spell/full_dictionary.rs index a11cd516..21e7476e 100644 --- a/harper-core/src/spell/full_dictionary.rs +++ b/harper-core/src/spell/full_dictionary.rs @@ -300,8 +300,8 @@ mod tests { #[test] fn herself_is_pronoun() { let dict = FullDictionary::curated(); - assert!(dict.get_word_metadata_str("herself").is_pronoun()); - assert!(dict.get_word_metadata_str("Herself").is_pronoun()); + assert!(dict.get_word_metadata_str("herself").is_pronoun_noun()); + assert!(dict.get_word_metadata_str("Herself").is_pronoun_noun()); } #[test]