Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Collapse identifiers seperated by hyphens or underscores if they exist in the file dictionary #199

Merged
merged 11 commits into from
Oct 4, 2024
Merged
250 changes: 250 additions & 0 deletions harper-core/src/parsers/collapse_identifiers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
use crate::Lrc;
use std::collections::VecDeque;

use itertools::Itertools;

use super::{Parser, TokenKind};
use crate::patterns::{PatternExt, SequencePattern};
use crate::{Dictionary, FullDictionary, MergedDictionary, Span, Token, VecExt, WordMetadata};

/// A parser that wraps any other parser to collapse token strings that match
/// the pattern word_word or word-word.
pub struct CollapseIdentifiers {
inner: Box<dyn Parser>,
dict: Lrc<MergedDictionary<FullDictionary>>,
}

impl CollapseIdentifiers {
pub fn new(inner: Box<dyn Parser>, dict: &Lrc<MergedDictionary<FullDictionary>>) -> Self {
Self {
inner,
dict: dict.clone(),
}
}
}

thread_local! {
static WORD_OR_NUMBER: Lrc<SequencePattern> = Lrc::new(SequencePattern::default()
.then_any_word()
.then_one_or_more(Box::new(SequencePattern::default()
.then_case_separator()
.then_any_word())));
}

impl Parser for CollapseIdentifiers {
fn parse(&mut self, source: &[char]) -> Vec<Token> {
let mut tokens = self.inner.parse(source);

let mut removal_indexes: VecDeque<usize> = VecDeque::default();
let replacements = WORD_OR_NUMBER
.with(|v| v.clone())
.find_all_matches(&tokens, source)
.into_iter()
.map(|s| {
let start_tok = tokens
.get(s.start)
.expect("Token not at expected position.");
let end_tok = tokens
.get(s.end - 1)
.expect("Token not at expected position.");
let char_span = Span::new(start_tok.span.start, end_tok.span.end);
(
s.start,
s.end,
Token::new(char_span, TokenKind::Word(WordMetadata::default())),
char_span.get_content_string(source),
)
})
.filter(|(_, _, _, st)| self.dict.contains_word_str(st))
.collect_vec();

replacements.into_iter().for_each(|(s, e, t, _)| {
(s + 1..=e).for_each(|n| removal_indexes.push_front(n));
tokens[s] = t;
});
tokens.remove_indices(removal_indexes.into_iter().sorted().unique().collect());

tokens
}
}

#[cfg(test)]
mod tests {
use crate::parsers::{PlainEnglish, StrParser};

use super::*;

#[test]
fn no_collapse() {
let dict = FullDictionary::curated();
let source = "This is a test.";

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(dict.into()))
.parse_str(source);
assert_eq!(tokens.len(), 8);
}

#[test]
fn one_collapse() {
let source = "This is a separated_identifier, wow!";
let default_dict = FullDictionary::curated();

let tokens = CollapseIdentifiers::new(
Box::new(PlainEnglish),
&Lrc::new(default_dict.clone().into()),
)
.parse_str(source);
assert_eq!(tokens.len(), 13);

let mut dict = FullDictionary::new();
dict.append_word(
"separated_identifier".chars().collect_vec(),
WordMetadata::default(),
);

let mut merged_dict = MergedDictionary::from(default_dict);
merged_dict.add_dictionary(Lrc::new(dict));

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
.parse_str(source);
assert_eq!(tokens.len(), 10);
}

#[test]
fn kebab_collapse() {
let source = "This is a separated-identifier, wow!";
let default_dict = FullDictionary::curated();

let tokens = CollapseIdentifiers::new(
Box::new(PlainEnglish),
&Lrc::new(default_dict.clone().into()),
)
.parse_str(source);
assert_eq!(tokens.len(), 13);

let mut dict = FullDictionary::new();
dict.append_word(
"separated-identifier".chars().collect_vec(),
WordMetadata::default(),
);

let mut merged_dict = MergedDictionary::from(default_dict);
merged_dict.add_dictionary(Lrc::new(dict));

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
.parse_str(source);
assert_eq!(tokens.len(), 10);
}

#[test]
fn double_collapse() {
let source = "This is a separated_identifier_token, wow!";
let default_dict = FullDictionary::curated();

let tokens = CollapseIdentifiers::new(
Box::new(PlainEnglish),
&Lrc::new(default_dict.clone().into()),
)
.parse_str(source);
assert_eq!(tokens.len(), 15);

let mut dict = FullDictionary::new();
dict.append_word(
"separated_identifier_token".chars().collect_vec(),
WordMetadata::default(),
);

let mut merged_dict = MergedDictionary::from(default_dict);
merged_dict.add_dictionary(Lrc::new(dict));

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
.parse_str(source);
assert_eq!(tokens.len(), 10);
}

#[test]
fn two_collapses() {
let source = "This is a separated_identifier, wow! separated_identifier";
let default_dict = FullDictionary::curated();

let tokens = CollapseIdentifiers::new(
Box::new(PlainEnglish),
&Lrc::new(default_dict.clone().into()),
)
.parse_str(source);
assert_eq!(tokens.len(), 17);

let mut dict = FullDictionary::new();
dict.append_word(
"separated_identifier".chars().collect_vec(),
WordMetadata::default(),
);

let mut merged_dict = MergedDictionary::from(default_dict);
merged_dict.add_dictionary(Lrc::new(dict));

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
.parse_str(source);
assert_eq!(tokens.len(), 12);
}

#[test]
fn overlapping_identifiers() {
let source = "This is a separated_identifier_token, wow!";
let default_dict = FullDictionary::curated();

let tokens = CollapseIdentifiers::new(
Box::new(PlainEnglish),
&Lrc::new(default_dict.clone().into()),
)
.parse_str(source);
assert_eq!(tokens.len(), 15);

let mut dict = FullDictionary::new();
dict.append_word(
"separated_identifier".chars().collect_vec(),
WordMetadata::default(),
);
dict.append_word(
"identifier_token".chars().collect_vec(),
WordMetadata::default(),
);

let mut merged_dict = MergedDictionary::from(default_dict);
merged_dict.add_dictionary(Lrc::new(dict));

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
.parse_str(source);
assert_eq!(tokens.len(), 15);
}

#[test]
fn nested_identifiers() {
let source = "This is a separated_identifier_token, wow!";
let default_dict = FullDictionary::curated();

let tokens = CollapseIdentifiers::new(
Box::new(PlainEnglish),
&Lrc::new(default_dict.clone().into()),
)
.parse_str(source);
assert_eq!(tokens.len(), 15);

let mut dict = FullDictionary::new();
dict.append_word(
"separated_identifier_token".chars().collect_vec(),
WordMetadata::default(),
);
dict.append_word(
"separated_identifier".chars().collect_vec(),
WordMetadata::default(),
);

let mut merged_dict = MergedDictionary::from(default_dict);
merged_dict.add_dictionary(Lrc::new(dict));

let tokens = CollapseIdentifiers::new(Box::new(PlainEnglish), &Lrc::new(merged_dict))
.parse_str(source);
assert_eq!(tokens.len(), 10);
}
}
9 changes: 9 additions & 0 deletions harper-core/src/parsers/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
mod collapse_identifiers;
mod markdown;
mod mask;
mod plain_english;

use blanket::blanket;
pub use collapse_identifiers::CollapseIdentifiers;
pub use markdown::Markdown;
pub use mask::Mask;
pub use plain_english::PlainEnglish;

pub use crate::token::{Token, TokenKind, TokenStringExt};

#[cfg(not(feature = "concurrent"))]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised this wasn't already here. How was harper-cli able to compile?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Idk, I just checked, and it turns out mask::Masker uses Sync too and doesn't have a concurrent cfg.

#[blanket(derive(Box))]
pub trait Parser {
fn parse(&mut self, source: &[char]) -> Vec<Token>;
}

#[cfg(feature = "concurrent")]
#[blanket(derive(Box))]
pub trait Parser: Send + Sync {
fn parse(&mut self, source: &[char]) -> Vec<Token>;
Expand Down
1 change: 1 addition & 0 deletions harper-core/src/patterns/sequence_pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ impl SequencePattern {
gen_then_from_is!(conjunction);
gen_then_from_is!(comma);
gen_then_from_is!(period);
gen_then_from_is!(case_separator);

pub fn then_exact_word(mut self, word: &'static str) -> Self {
self.token_patterns
Expand Down
18 changes: 15 additions & 3 deletions harper-core/src/spell/merged_dictionary.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::sync::Arc;
use crate::Lrc;

use hashbrown::HashMap;

Expand All @@ -12,7 +12,7 @@ pub struct MergedDictionary<T>
where
T: Dictionary + Clone,
{
children: Vec<Arc<T>>,
children: Vec<Lrc<T>>,
merged: HashMap<CharString, WordMetadata>,
}

Expand All @@ -27,11 +27,23 @@ where
}
}

pub fn add_dictionary(&mut self, dictionary: Arc<T>) {
pub fn add_dictionary(&mut self, dictionary: Lrc<T>) {
self.children.push(dictionary.clone());
}
}

impl<T> From<Lrc<T>> for MergedDictionary<T>
where
T: Dictionary + Clone,
{
fn from(value: Lrc<T>) -> Self {
Self {
children: vec![value],
..Default::default()
}
}
}

impl<T> Default for MergedDictionary<T>
where
T: Dictionary + Clone,
Expand Down
8 changes: 8 additions & 0 deletions harper-core/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,14 @@ impl TokenKind {
matches!(self, TokenKind::Punctuation(Punctuation::At))
}

/// Used by `crate::parsers::CollapseIdentifiers`
/// TODO: Separate this into two functions and add OR functionality to
/// pattern matching
pub fn is_case_separator(&self) -> bool {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be two separate functions, but it's probably OK for now. If someone really wants it I can figure out how to do OR in pattern matching.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably fine to leave this here, but if you do, add a doc-comment that explains what a "case separator" is and why it's needed.

I would much rather create an OR Pattern and use that though, since we need one anyway. It's up to you which you decide to do (laziness is totally fine in this case).

matches!(self, TokenKind::Punctuation(Punctuation::Underscore))
|| matches!(self, TokenKind::Punctuation(Punctuation::Hyphen))
}

pub fn is_verb(&self) -> bool {
let TokenKind::Word(metadata) = self else {
return false;
Expand Down
11 changes: 8 additions & 3 deletions harper-ls/src/backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::sync::Arc;
use anyhow::anyhow;
use harper_comments::CommentParser;
use harper_core::linting::{LintGroup, Linter};
use harper_core::parsers::{Markdown, PlainEnglish};
use harper_core::parsers::{CollapseIdentifiers, Markdown, PlainEnglish};
use harper_core::{
Dictionary, Document, FullDictionary, MergedDictionary, Token, TokenKind, WordMetadata,
};
Expand Down Expand Up @@ -199,9 +199,14 @@ impl Backend {
doc_state.linter = LintGroup::new(config_lock.lint_config, merged.clone());
doc_state.dict = merged.clone();
}
Document::new_from_vec(
source,
&mut CollapseIdentifiers::new(Box::new(ts_parser), &doc_state.dict),
&doc_state.dict,
)
} else {
Document::new_from_vec(source, &mut ts_parser, &doc_state.dict)
}

Document::new_from_vec(source, &mut ts_parser, &doc_state.dict)
} else if language_id == "markdown" {
Document::new(text, &mut Markdown, &doc_state.dict)
} else if language_id == "gitcommit" {
Expand Down
Loading