feat: Collapse identifiers separated by hyphens or underscores if they exist in the file dictionary #199
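
For orientation, here is a condensed sketch of the behavior this PR adds, assembled from the tests in the diff below; module paths and imports are omitted because they depend on the crate layout, and the token count is taken directly from the one_collapse test.

    // Hedged sketch: add the identifier to a file-local dictionary, merge it
    // with the curated dictionary, and wrap the plain-English parser.
    let mut file_dict = FullDictionary::new();
    file_dict.append_word(
        "separated_identifier".chars().collect_vec(),
        WordMetadata::default(),
    );

    let mut merged = MergedDictionary::from(FullDictionary::curated());
    merged.add_dictionary(Lrc::new(file_dict));

    // "separated_identifier" now comes back as a single Word token rather
    // than word / underscore / word.
    let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged))
        .parse_str("This is a separated_identifier, wow!");
    assert_eq!(tokens.len(), 10);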
@@ -0,0 +1,250 @@
use crate::Lrc;
use std::collections::VecDeque;

use itertools::Itertools;

use super::{Parser, TokenKind};
use crate::patterns::{PatternExt, SequencePattern};
use crate::{Dictionary, FullDictionary, MergedDictionary, Span, Token, VecExt, WordMetadata};

/// A parser that wraps any other parser to collapse token strings that match
/// the pattern word_word or word-word.
pub struct CollapseIdentifiers {
    inner: Box<dyn Parser>,
    dict: Lrc<MergedDictionary<FullDictionary>>,
}

impl CollapseIdentifiers {
    pub fn new(inner: Box<dyn Parser>, dict: &Lrc<MergedDictionary<FullDictionary>>) -> Self {
        Self {
            inner,
            dict: dict.clone(),
        }
    }
}

thread_local! {
    static WORD_OR_NUMBER: Lrc<SequencePattern> = Lrc::new(SequencePattern::default()
        .then_any_word()
        .then_one_or_more(Box::new(SequencePattern::default()
            .then_case_separator()
            .then_any_word())));
}

impl Parser for CollapseIdentifiers {
    fn parse(&mut self, source: &[char]) -> Vec<Token> {
        let mut tokens = self.inner.parse(source);

        // Find every run of tokens shaped like `word (separator word)+`,
        // record the character span it covers, and keep only the runs whose
        // full text (separators included) exists in the dictionary.
        let mut removal_indexes: VecDeque<usize> = VecDeque::default();
        let replacements = WORD_OR_NUMBER
            .with(|v| v.clone())
            .find_all_matches(&tokens, source)
            .into_iter()
            .map(|s| {
                let start_tok = tokens
                    .get(s.start)
                    .expect("Token not at expected position.");
                let end_tok = tokens
                    .get(s.end - 1)
                    .expect("Token not at expected position.");
                let char_span = Span::new(start_tok.span.start, end_tok.span.end);
                (
                    s.start,
                    s.end,
                    Token::new(char_span, TokenKind::Word(WordMetadata::default())),
                    char_span.get_content_string(source),
                )
            })
            .filter(|(_, _, _, st)| self.dict.contains_word_str(st))
            .collect_vec();

        // Collapse each match in place: the first token becomes the merged
        // Word token, and the following token indices are queued for removal.
        replacements.into_iter().for_each(|(s, e, t, _)| {
            (s + 1..=e).for_each(|n| removal_indexes.push_front(n));
            tokens[s] = t;
        });
        tokens.remove_indices(removal_indexes.into_iter().sorted().unique().collect());

        tokens
    }
}

#[cfg(test)]
mod tests {
    use crate::parsers::{PlainEnglish, StrParser};

    use super::*;

    #[test]
    fn no_collapse() {
        let dict = FullDictionary::curated();
        let source = "This is a test.";

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(dict.into()))
            .parse_str(source);
        assert_eq!(tokens.len(), 8);
    }

    #[test]
    fn one_collapse() {
        let source = "This is a separated_identifier, wow!";
        let default_dict = FullDictionary::curated();

        let tokens = CollapseIdentifiers::new(
            Box::new(PlainEnglish),
            &Lrc::new(default_dict.clone().into()),
        )
        .parse_str(source);
        assert_eq!(tokens.len(), 13);

        let mut dict = FullDictionary::new();
        dict.append_word(
            "separated_identifier".chars().collect_vec(),
            WordMetadata::default(),
        );

        let mut merged_dict = MergedDictionary::from(default_dict);
        merged_dict.add_dictionary(Lrc::new(dict));

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
            .parse_str(source);
        assert_eq!(tokens.len(), 10);
    }

    #[test]
    fn kebab_collapse() {
        let source = "This is a separated-identifier, wow!";
        let default_dict = FullDictionary::curated();

        let tokens = CollapseIdentifiers::new(
            Box::new(PlainEnglish),
            &Lrc::new(default_dict.clone().into()),
        )
        .parse_str(source);
        assert_eq!(tokens.len(), 13);

        let mut dict = FullDictionary::new();
        dict.append_word(
            "separated-identifier".chars().collect_vec(),
            WordMetadata::default(),
        );

        let mut merged_dict = MergedDictionary::from(default_dict);
        merged_dict.add_dictionary(Lrc::new(dict));

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
            .parse_str(source);
        assert_eq!(tokens.len(), 10);
    }

    #[test]
    fn double_collapse() {
        let source = "This is a separated_identifier_token, wow!";
        let default_dict = FullDictionary::curated();

        let tokens = CollapseIdentifiers::new(
            Box::new(PlainEnglish),
            &Lrc::new(default_dict.clone().into()),
        )
        .parse_str(source);
        assert_eq!(tokens.len(), 15);

        let mut dict = FullDictionary::new();
        dict.append_word(
            "separated_identifier_token".chars().collect_vec(),
            WordMetadata::default(),
        );

        let mut merged_dict = MergedDictionary::from(default_dict);
        merged_dict.add_dictionary(Lrc::new(dict));

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
            .parse_str(source);
        assert_eq!(tokens.len(), 10);
    }

    #[test]
    fn two_collapses() {
        let source = "This is a separated_identifier, wow! separated_identifier";
        let default_dict = FullDictionary::curated();

        let tokens = CollapseIdentifiers::new(
            Box::new(PlainEnglish),
            &Lrc::new(default_dict.clone().into()),
        )
        .parse_str(source);
        assert_eq!(tokens.len(), 17);

        let mut dict = FullDictionary::new();
        dict.append_word(
            "separated_identifier".chars().collect_vec(),
            WordMetadata::default(),
        );

        let mut merged_dict = MergedDictionary::from(default_dict);
        merged_dict.add_dictionary(Lrc::new(dict));

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
            .parse_str(source);
        assert_eq!(tokens.len(), 12);
    }

    #[test]
    fn overlapping_identifiers() {
        let source = "This is a separated_identifier_token, wow!";
        let default_dict = FullDictionary::curated();

        let tokens = CollapseIdentifiers::new(
            Box::new(PlainEnglish),
            &Lrc::new(default_dict.clone().into()),
        )
        .parse_str(source);
        assert_eq!(tokens.len(), 15);

        let mut dict = FullDictionary::new();
        dict.append_word(
            "separated_identifier".chars().collect_vec(),
            WordMetadata::default(),
        );
        dict.append_word(
            "identifier_token".chars().collect_vec(),
            WordMetadata::default(),
        );

        let mut merged_dict = MergedDictionary::from(default_dict);
        merged_dict.add_dictionary(Lrc::new(dict));

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
            .parse_str(source);
        assert_eq!(tokens.len(), 15);
    }

    #[test]
    fn nested_identifiers() {
        let source = "This is a separated_identifier_token, wow!";
        let default_dict = FullDictionary::curated();

        let tokens = CollapseIdentifiers::new(
            Box::new(PlainEnglish),
            &Lrc::new(default_dict.clone().into()),
        )
        .parse_str(source);
        assert_eq!(tokens.len(), 15);

        let mut dict = FullDictionary::new();
        dict.append_word(
            "separated_identifier_token".chars().collect_vec(),
            WordMetadata::default(),
        );
        dict.append_word(
            "separated_identifier".chars().collect_vec(),
            WordMetadata::default(),
        );

        let mut merged_dict = MergedDictionary::from(default_dict);
        merged_dict.add_dictionary(Lrc::new(dict));

        let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
            .parse_str(source);
        assert_eq!(tokens.len(), 10);
    }
}
@@ -242,6 +242,14 @@ impl TokenKind {
        matches!(self, TokenKind::Punctuation(Punctuation::At))
    }

    /// Used by `crate::parsers::CollapseIdentifiers`
    /// TODO: Separate this into two functions and add OR functionality to
    /// pattern matching
    pub fn is_case_separator(&self) -> bool {
        matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
            || matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
    }

    pub fn is_verb(&self) -> bool {
        let TokenKind::Word(metadata) = self else {
            return false;

Review comment on is_case_separator: This should probably be two separate functions, but it's probably OK for now. If someone really wants it, I can figure out how to do OR in pattern matching.

Review comment: It's probably fine to leave this here, but if you do, add a doc comment that explains what a "case separator" is and why it's needed. I would much rather create an OR pattern.
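
As an illustration of the reviewers' suggestion only (not part of this PR), a minimal sketch of what splitting the predicate might look like; the names is_hyphen and is_underscore are hypothetical, while the Punctuation variants are the ones already used in the diff.

    impl TokenKind {
        /// Hypothetical helper: matches only `-` tokens.
        pub fn is_hyphen(&self) -> bool {
            matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
        }

        /// Hypothetical helper: matches only `_` tokens.
        pub fn is_underscore(&self) -> bool {
            matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
        }

        /// A "case separator" is the punctuation joining the pieces of a
        /// snake_case or kebab-case identifier.
        pub fn is_case_separator(&self) -> bool {
            self.is_hyphen() || self.is_underscore()
        }
    }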

Review comment: I'm surprised this wasn't already here. How was harper-cli able to compile?

Reply: Idk, I just checked, and it turns out mask::Masker uses Sync too and doesn't have a concurrent cfg.
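
For context on the Sync question, a hedged sketch of the usual way an Lrc alias is cfg-gated between Rc and Arc; this is an assumption about the pattern being discussed, not a quote of harper's actual definition, and the feature name is taken from the "concurrent cfg" mentioned above.

    // Assumed pattern: single-threaded builds use Rc, while a `concurrent`
    // feature switches to Arc. Rc<T> is neither Send nor Sync, so any code
    // that needs Sync only compiles against the Arc variant.
    #[cfg(not(feature = "concurrent"))]
    pub type Lrc<T> = std::rc::Rc<T>;

    #[cfg(feature = "concurrent")]
    pub type Lrc<T> = std::sync::Arc<T>;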