diff --git a/hipcheck/src/analysis/mod.rs b/hipcheck/src/analysis/mod.rs index 68117934..c387c9ad 100644 --- a/hipcheck/src/analysis/mod.rs +++ b/hipcheck/src/analysis/mod.rs @@ -11,7 +11,6 @@ use crate::{ F64, }; use std::{ - collections::HashSet, default::Default, }; @@ -20,47 +19,15 @@ use std::{ pub trait AnalysisProvider: AttacksConfigQuery + CommitConfigQuery + GitProvider + MetricProvider + PracticesConfigQuery { - /// Returns result of churn analysis - fn churn_analysis(&self) -> Result; - - /// Returns result of entropy analysis - fn entropy_analysis(&self) -> Result; - /// Returns result of identity analysis fn identity_analysis(&self) -> Result; /// Returns result of fuzz analysis fn fuzz_analysis(&self) -> Result; - - /// Returns result of typo analysis - fn typo_analysis(&self) -> Result; } - -pub fn churn_analysis(db: &dyn AnalysisProvider) -> Result { - let results = db.churn_metric()?; - let value: Vec = results.commit_churn_freqs.iter().map(|o| o.churn).collect(); - // @Todo - in RFD4 transition we lost the ability to flag commits, because - // the need to flag them as concerns is dependent on policy expr - Ok(QueryResult { - value: serde_json::to_value(value)?, - concerns: vec![], - }) -} - -pub fn entropy_analysis(db: &dyn AnalysisProvider) -> Result { - let results = db.entropy_metric()?; - let value: Vec = results.commit_entropies.iter().map(|o| o.entropy).collect(); - // @Todo - in RFD4 transition we lost the ability to flag commits, because - // the need to flag them as concerns is dependent on policy expr - Ok(QueryResult { - value: serde_json::to_value(value)?, - concerns: vec![], - }) -} - pub fn identity_analysis(db: &dyn AnalysisProvider) -> Result { let results = db.identity_metric()?; let num_flagged = results @@ -83,25 +50,4 @@ pub fn fuzz_analysis(db: &dyn AnalysisProvider) -> Result { value: serde_json::to_value(value)?, concerns: vec![], }) -} - - -pub fn typo_analysis(db: &dyn AnalysisProvider) -> Result { - let results = db.typo_metric()?; - - // @Note - policy expr json injection does not support string/obj as array elts - let value = results.typos.iter().map(|_| true).collect::>(); - - let concerns: Vec = results - .typos - .iter() - .map(|typodep| typodep.dependency.to_string()) - .collect::>() - .into_iter() - .collect(); - - Ok(QueryResult { - value: serde_json::to_value(value)?, - concerns, - }) -} +} \ No newline at end of file diff --git a/hipcheck/src/main.rs b/hipcheck/src/main.rs index e23a8562..f1e1afde 100644 --- a/hipcheck/src/main.rs +++ b/hipcheck/src/main.rs @@ -39,7 +39,6 @@ use crate::{ session::Session, setup::{resolve_and_transform_source, SourceType}, shell::Shell, - util::iter::{TryAny, TryFilter}, }; use cli::{ CacheArgs, CacheOp, CheckArgs, CliConfig, FullCommands, PluginArgs, SchemaArgs, SchemaCommand, diff --git a/hipcheck/src/metric/churn.rs b/hipcheck/src/metric/churn.rs deleted file mode 100644 index ecdcdd7a..00000000 --- a/hipcheck/src/metric/churn.rs +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - data::git::Commit, - error::{Context as _, Result}, - hc_error, - metric::{ - math::{mean, std_dev}, - MetricProvider, - }, - TryAny, TryFilter, F64, -}; -use serde::Serialize; -use std::{collections::HashMap, sync::Arc}; - -#[derive(Debug, Eq, PartialEq, Serialize)] -pub struct ChurnOutput { - pub commit_churn_freqs: Vec, -} - -#[derive(Debug, Clone, Eq, PartialEq, Serialize)] -pub struct CommitChurnFreq { - pub commit: Arc, - pub churn: F64, -} - -pub fn churn_metric(db: &dyn MetricProvider) -> Result> { - log::debug!("running churn metric"); - - let commit_diffs = db.commit_diffs().context("failed to get commit diffs")?; - - let commit_diffs = commit_diffs - .iter() - .try_filter(|cd| { - cd.diff - .file_diffs - .iter() - .try_any(|fd| db.is_likely_source_file(Arc::clone(&fd.file_name))) - }) - .collect::>>()?; - - let mut commit_churns = Vec::new(); - let mut total_files_changed: i64 = 0; - let mut total_lines_changed: i64 = 0; - - for commit_diff in commit_diffs { - let source_files = commit_diff - .diff - .file_diffs - .iter() - .try_filter(|file_diff| db.is_likely_source_file(Arc::clone(&file_diff.file_name))) - .collect::>>()?; - - // Update files changed. - let files_changed = source_files.len() as i64; - total_files_changed += files_changed; - - // Update lines changed. - let mut lines_changed: i64 = 0; - for file_diff in &source_files { - lines_changed += file_diff - .additions - .ok_or_else(|| hc_error!("GitHub commits can't be used for churn"))?; - lines_changed += file_diff - .deletions - .ok_or_else(|| hc_error!("GitHub commits can't be used for churn"))?; - } - total_lines_changed += lines_changed; - - commit_churns.push(CommitChurn { - commit: Arc::clone(&commit_diff.commit), - files_changed, - lines_changed, - }); - } - - let mut commit_churn_freqs: Vec<_> = { - let file_frequencies: HashMap<&str, f64> = commit_churns - .iter() - .map(|commit_churn| { - // avoid dividing by zero. - if total_files_changed == 0 { - (commit_churn.commit.hash.as_ref(), 0.0) - } else { - ( - commit_churn.commit.hash.as_ref(), - commit_churn.files_changed as f64 / total_files_changed as f64, - ) - } - }) - .collect(); - - let line_frequencies: HashMap<&str, f64> = commit_churns - .iter() - .map(|commit_churn| { - // avoid dividing by zero. - if total_lines_changed == 0 { - (commit_churn.commit.hash.as_ref(), 0.0) - } else { - ( - commit_churn.commit.hash.as_ref(), - commit_churn.lines_changed as f64 / total_lines_changed as f64, - ) - } - }) - .collect(); - - commit_churns - .iter() - .map(|commit_churn| { - let hash: &str = commit_churn.commit.hash.as_ref(); - let file_frequency = file_frequencies[hash]; - let line_frequency = line_frequencies[hash]; - // PANIC: Safe to unwrap, beacuse we are creating a valid floating point number - let churn = - F64::new(file_frequency * line_frequency * line_frequency * 1_000_000.0) - .unwrap(); - - CommitChurnFreq { - commit: Arc::clone(&commit_churn.commit), - churn, - } - }) - .collect() - }; - - let churns: Vec<_> = commit_churn_freqs - .iter() - .map(|c| c.churn.into_inner()) - .collect(); - - let mean = - mean(&churns).ok_or_else(|| crate::error::Error::msg("failed to get mean churn value"))?; - let std_dev = std_dev(mean, &churns) - .ok_or_else(|| crate::error::Error::msg("failed to get churn standard deviation"))?; - - log::trace!("mean of churn scores [mean='{}']", mean); - log::trace!("standard deviation of churn scores [stddev='{}']", std_dev); - - if std_dev == 0.0 { - return Err(hc_error!("not enough commits to calculate churn")); - } - - for commit_churn_freq in &mut commit_churn_freqs { - commit_churn_freq.churn = (commit_churn_freq.churn - mean) / std_dev; - } - - log::info!("completed churn metric"); - - Ok(Arc::new(ChurnOutput { commit_churn_freqs })) -} - -#[derive(Debug)] -pub struct CommitChurn { - commit: Arc, - files_changed: i64, - lines_changed: i64, -} diff --git a/hipcheck/src/metric/entropy.rs b/hipcheck/src/metric/entropy.rs deleted file mode 100644 index 40a79278..00000000 --- a/hipcheck/src/metric/entropy.rs +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - data::git::{Commit, CommitDiff, FileDiff}, - error::{Context as _, Result}, - hc_error, - metric::{ - math::{mean, std_dev}, - MetricProvider, - }, - TryAny, TryFilter, F64, -}; -use dashmap::DashMap; -use finl_unicode::grapheme_clusters::Graphemes; -use rayon::prelude::*; -use serde::Serialize; -use std::{collections::HashMap, iter::Iterator, sync::Arc}; -use unicode_normalization::UnicodeNormalization; - -/// Analyze a source to produce a set of entropy scores for its commits. -/// -/// # Algorithm -/// -/// The entropy algorithm works roughly as follows: -/// -/// 1. Get the list of all commits for a repository. -/// 2. Filter out the commits which do not contain any source-code changes. -/// 3. Calculate the frequencies of graphemes in the source-file patches (additions and deletions) -/// for each commit. -/// 4. Calculate the overall frequencies of each grapheme across all source-file patches from all -/// commits. -/// 5. Calculate the "commit entropy" score for each commit as the sum of each grapheme frequency -/// times the log base 2 of the grapheme frequency divided by the total frequency. -/// 6. Normalize these "commit entropy" scores into Z-scores. -/// -/// The idea here is that this metric captures the degree of textual randomness, but does _not_ -/// incorporate any positional information. It is solely based on the frequency of graphemes -/// in the patch text. -pub fn entropy_metric(db: &dyn MetricProvider) -> Result> { - log::debug!("running entropy metric"); - - // Calculate the grapheme frequencies for each commit which contains code. - let commit_freqs = db - .commit_diffs() - .context("failed to get commit diffs")? - .iter() - .try_filter(|cd| is_likely_source_file(cd, db)) - .map(|result| match result { - Ok(commit_diff) => grapheme_freqs(commit_diff, db), - Err(e) => Err(e), - }) - .collect::>>()?; - - // Calculate baseline grapheme frequencies across all commits which contain code. - let baseline_freqs = baseline_freqs(&commit_freqs); - - // Calculate the entropy of each commit which contains code. - let mut commit_entropies = commit_freqs - .iter() - .map(|commit_freq| commit_entropy(commit_freq, &baseline_freqs)) - .collect::>(); - - // Sort the commits by entropy score - // PANIC: It is safe to unwrap here, because the entropy scores will always be valid floating point numbers if we get to this point - commit_entropies.sort_by(|a, b| b.entropy.partial_cmp(&a.entropy).unwrap()); - - // Convert to Z-scores and return results. - let entropy_output = z_scores(commit_entropies) - .map(EntropyOutput::new) - .map(Arc::new)?; - - log::info!("completed entropy metric"); - - Ok(entropy_output) -} - -/// The final output for entropy metric, containing an entropy score for -/// every commit. -#[derive(Debug, Eq, PartialEq, Serialize)] -pub struct EntropyOutput { - /// The set of commit entropies. - pub commit_entropies: Vec, -} - -impl EntropyOutput { - /// Construct an `Output` from a set of commit entropies. - fn new(commit_entropies: Vec) -> EntropyOutput { - EntropyOutput { commit_entropies } - } -} - -/// The entropy of a single commit. -#[derive(Debug, Eq, PartialEq, Serialize, Clone)] -pub struct CommitEntropy { - /// The commit - pub commit: Arc, - /// The entropy score - pub entropy: F64, -} - -/// The grapheme frequencies of a single commit. -#[derive(Debug)] -struct CommitGraphemeFreq { - /// The commit. - commit: Arc, - /// The set of grapheme frequencies. - grapheme_freqs: Vec, -} - -/// The frequency of a single grapheme. -#[derive(Debug)] -struct GraphemeFreq { - /// The grapheme. - grapheme: String, - /// The frequency. - freq: f64, -} - -impl GraphemeFreq { - fn as_view(&self) -> GraphemeFreqView<'_> { - GraphemeFreqView { - grapheme: &self.grapheme, - freq: self.freq, - } - } -} - -/// A view of a grapheme frequency. -struct GraphemeFreqView<'gra> { - /// The view of the grapheme. - grapheme: &'gra str, - /// The freq (fine to copy) - freq: f64, -} - -/// Check if a commit diff is a likely source file. -fn is_likely_source_file( - commit_diff: &CommitDiff, - db: &dyn MetricProvider, -) -> crate::error::Result { - commit_diff - .diff - .file_diffs - .iter() - .try_any(|fd| db.is_likely_source_file(Arc::clone(&fd.file_name))) -} - -/// Calculate grapheme frequencies for each commit. -fn grapheme_freqs(commit_diff: &CommitDiff, db: &dyn MetricProvider) -> Result { - // #[cfg(feature = "print-timings")] - // let _0 = crate::benchmarking::print_scope_time!("grapheme_freqs"); - - // Dashmap (fast concurrent hashmap) to store counts for each grapheme. - let grapheme_table: DashMap = DashMap::new(); - - // Use this variable to track the total number of graphemes accross all patches in this commit diff. - let tgt_diffs: Result> = commit_diff - .diff - .file_diffs - .iter() - .filter_map(|file_diff| { - // Filter out any that are probably not source files, or are empty patches - let is_source: bool = match db.is_likely_source_file(Arc::clone(&file_diff.file_name)) { - Err(e) => { - return Some(Err(e)); - } - Ok(s) => s, - }; - if !is_source || file_diff.patch.is_empty() { - None - } else { - Some(Ok(file_diff)) - } - }) - .collect(); - // Use this variable to track the total number of graphemes accross all patches in this commit diff. - let total_graphemes: usize = tgt_diffs? - // Iterate over each line of each file in parallel - .par_iter() - .flat_map(|file_diff| file_diff.patch.par_lines()) - // Normalize each line. - // See https://en.wikipedia.org/wiki/Unicode_equivalence. - .map(|line: &str| line.chars().nfc().collect::()) - // Count the graphemes in each normalized line. - // Also update the graphemes table here. - // We'll sum these counts to get the total number of graphemes. - .map(|normalized_line: String| { - // Create an iterator over the graphemes in the line. - Graphemes::new(&normalized_line) - // Update the graphemes table. - .map(|grapheme: &str| { - // Use this if statement to avoid allocating a new string unless needed. - if let Some(mut count) = grapheme_table.get_mut(grapheme) { - *count += 1; - } else { - grapheme_table.insert(grapheme.to_owned(), 1); - } - }) - // get the grapheme count for this normalized line. - .count() - }) - // Aggregate the grapheme count across all lines of all files - .sum(); - - // Transform out table (dashmap) of graphemes and their frequencies into a list to return. - let grapheme_freqs = grapheme_table - // Iterate in parallel for performance. - .into_par_iter() - .map(|(grapheme, count)| GraphemeFreq { - grapheme, - freq: count as f64 / total_graphemes as f64, - }) - .collect(); - - // Return the collected list of graphemes and their frequencies for this commit diff. - Ok(CommitGraphemeFreq { - commit: Arc::clone(&commit_diff.commit), - grapheme_freqs, - }) -} - -/// Calculate baseline frequencies for each grapheme across all commits. -fn baseline_freqs(commit_freqs: &[CommitGraphemeFreq]) -> HashMap<&str, (f64, i64)> { - // PERFORMANCE: At the moment this function appears to be faster single-threaded. - // I tried switching out the hashmap with a Dashamp and switching the iterator to rayon, - // but the overhead is not worth it (against express we go from 3 milliseconds to 6). - // This may be worth revisiting if we prioritize projects with huge numbers of commits, but at the moment - // I will leave it be. - - #[cfg(feature = "print-timings")] - let _0 = crate::benchmarking::print_scope_time!("baseline_freqs"); - - let mut baseline: HashMap<&str, (f64, i64)> = HashMap::new(); - - commit_freqs - .iter() - .flat_map(|cf: &CommitGraphemeFreq| cf.grapheme_freqs.iter().map(GraphemeFreq::as_view)) - .for_each(|view: GraphemeFreqView| { - let entry = baseline.entry(view.grapheme).or_insert((0.0, 0)); - let cum_avg = entry.0; - let n = entry.1; - entry.0 = (view.freq + (n as f64) * cum_avg) / ((n + 1) as f64); - entry.1 = n + 1; - }); - - baseline -} - -/// Calculate commit entropy for each commit. -fn commit_entropy( - commit_freq: &CommitGraphemeFreq, - baseline: &HashMap<&str, (f64, i64)>, -) -> CommitEntropy { - let commit = Arc::clone(&commit_freq.commit); - let entropy = F64::new( - commit_freq - .grapheme_freqs - .iter() - .map(|grapheme_freq| { - // Get the freq for the current commit & grapheme. - let freq = grapheme_freq.freq; - - // Get the baseline freq for that grapheme across all commits. - let grapheme = grapheme_freq.grapheme.as_str(); - let baseline_freq = baseline.get(grapheme).unwrap().0; - - // Calculate the score for that grapheme. - freq * (freq / baseline_freq).log2() - }) - // Sum all individual grapheme scores together to get the commit's entropy. - .sum(), - ) - .unwrap(); - - CommitEntropy { commit, entropy } -} - -/// Convert entropy scores to Z-scores of the underlying metric. -fn z_scores(mut commit_entropies: Vec) -> Result> { - let entropies: Vec<_> = commit_entropies - .iter() - .map(|c| c.entropy.into_inner()) - .collect(); - - let mean = - mean(&entropies).ok_or_else(|| crate::error::Error::msg("failed to get mean entropy"))?; - let std_dev = std_dev(mean, &entropies) - .ok_or_else(|| crate::error::Error::msg("failed to get entropy standard deviation"))?; - - if std_dev == 0.0 { - return Err(hc_error!("not enough commits to calculate entropy")); - } - - for commit_entropy in &mut commit_entropies { - commit_entropy.entropy = (commit_entropy.entropy - mean) / std_dev; - } - - Ok(commit_entropies) -} diff --git a/hipcheck/src/metric/math.rs b/hipcheck/src/metric/math.rs index d14e6ed7..cdc649ac 100644 --- a/hipcheck/src/metric/math.rs +++ b/hipcheck/src/metric/math.rs @@ -1,34 +1 @@ // SPDX-License-Identifier: Apache-2.0 - -/// Calculate the arithmetic mean for a set of floats. Returns an option to account -/// for the possibility of dividing by zero. -pub fn mean(data: &[f64]) -> Option { - // Do not use rayon's parallel iter/sum here due to the non-associativity of floating point numbers/math. - // See: https://en.wikipedia.org/wiki/Associative_property#Nonassociativity_of_floating_point_calculation. - let sum = data.iter().sum::(); - let count = data.len(); - - match count { - positive if positive > 0 => Some(sum / count as f64), - _ => None, - } -} - -/// Calculate the standard deviation for a set of floats. Returns an option to -/// account for the possibility of dividing by zero. -pub fn std_dev(mean: f64, data: &[f64]) -> Option { - match (mean, data.len()) { - (mean, count) if count > 0 => { - let variance = - data.iter() - .map(|value| { - let diff = mean - *value; - diff * diff - }) - .sum::() / count as f64; - - Some(variance.sqrt()) - } - _ => None, - } -} diff --git a/hipcheck/src/metric/mod.rs b/hipcheck/src/metric/mod.rs index 24d3146f..3e51b02a 100644 --- a/hipcheck/src/metric/mod.rs +++ b/hipcheck/src/metric/mod.rs @@ -1,24 +1,21 @@ // SPDX-License-Identifier: Apache-2.0 pub mod binary_detector; -pub mod churn; pub mod commit_trust; pub mod contributor_trust; -pub mod entropy; pub mod fuzz; pub mod identity; pub mod linguist; mod math; -pub mod typo; use crate::{ config::{AttacksConfigQuery, CommitConfigQuery}, data::{git::GitProvider, DependenciesProvider, FuzzProvider, PullRequestReviewProvider}, error::Result, metric::{ - binary_detector::BinaryFile, churn::ChurnOutput, commit_trust::CommitTrustOutput, - contributor_trust::ContributorTrustOutput, entropy::EntropyOutput, fuzz::FuzzOutput, - identity::IdentityOutput, linguist::Linguist, typo::TypoOutput, + binary_detector::BinaryFile, commit_trust::CommitTrustOutput, + contributor_trust::ContributorTrustOutput, fuzz::FuzzOutput, + identity::IdentityOutput, linguist::Linguist, }, }; use std::sync::Arc; @@ -35,10 +32,6 @@ pub trait MetricProvider: + FuzzProvider + PullRequestReviewProvider { - /// Returns result of churn metric - #[salsa::invoke(churn::churn_metric)] - fn churn_metric(&self) -> Result>; - /// Returns result of contributor trust metric #[salsa::invoke(commit_trust::commit_trust_metric)] fn commit_trust_metric(&self) -> Result>; @@ -47,10 +40,6 @@ pub trait MetricProvider: #[salsa::invoke(contributor_trust::contributor_trust_metric)] fn contributor_trust_metric(&self) -> Result>; - /// Returns result of entropy metric - #[salsa::invoke(entropy::entropy_metric)] - fn entropy_metric(&self) -> Result>; - /// Returns result of identity metric #[salsa::invoke(identity::identity_metric)] fn identity_metric(&self) -> Result>; @@ -58,8 +47,4 @@ pub trait MetricProvider: /// Returns result of fuzz metric #[salsa::invoke(fuzz::fuzz_metric)] fn fuzz_metric(&self) -> Result>; - - /// Returns result of typo metric - #[salsa::invoke(typo::typo_metric)] - fn typo_metric(&self) -> Result>; } \ No newline at end of file diff --git a/hipcheck/src/metric/typo.rs b/hipcheck/src/metric/typo.rs deleted file mode 100644 index 5a36b8e0..00000000 --- a/hipcheck/src/metric/typo.rs +++ /dev/null @@ -1,782 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - data::{Dependencies, Lang}, - error::{Context as _, Result}, - metric::MetricProvider, - util::fs as file, -}; -use maplit::hashmap; -use serde::{Deserialize, Serialize}; -use std::{ - cmp::Ordering, collections::HashMap, convert::AsRef, fmt, fmt::Display, path::Path, str, - sync::Arc, -}; - -#[derive(Debug, Eq, PartialEq, Serialize)] -pub struct TypoOutput { - pub typos: Vec, -} - -#[derive(Debug, Clone, Eq, PartialEq, Serialize)] -pub struct TypoDep { - pub dependency: Arc, - pub typo: Typo, -} - -pub fn typo_metric(db: &dyn MetricProvider) -> Result> { - log::debug!("running typo metric"); - - let typo_file = TypoFile::load_from(&db.typo_file()?).context("failed to load typo file")?; - - let dependencies = db.dependencies().context("failed to get dependencies")?; - - let typo_output = match dependencies.language { - Lang::JavaScript => typos_for_javascript(&typo_file, dependencies), - Lang::Unknown => Err(crate::error::Error::msg( - "failed to identify a known language", - )), - }?; - - log::info!("completed typo metric"); - - Ok(typo_output) -} - -fn typos_for_javascript( - typo_file: &TypoFile, - dependencies: Arc, -) -> Result> { - let mut typos = Vec::new(); - - for legit_name in &typo_file.languages.javascript { - let fuzzer = NameFuzzer::new(legit_name); - - for dependency in &dependencies.deps { - for typo in fuzzer.fuzz(dependency) { - typos.push(TypoDep { - dependency: Arc::clone(dependency), - typo: typo.clone(), - }) - } - } - } - - Ok(Arc::new(TypoOutput { typos })) -} - -#[derive(Debug, Deserialize)] -struct TypoFile { - languages: Languages, -} - -#[derive(Debug, Deserialize)] -struct Languages { - javascript: Vec, -} - -impl TypoFile { - fn load_from(typo_path: &Path) -> Result { - file::exists(typo_path).context("typo file does not exist")?; - let typo_file = file::read_toml(typo_path).context("failed to open typo file")?; - - Ok(typo_file) - } -} - -#[derive(Debug, Clone)] -pub struct NameFuzzer<'t> { - // A map of strings which may be typos to the notes for what they may be - // typos of. Fuzzing then only needs to hash the string and look it up in the - // typo hash map. - typos: HashMap>, - // The list of original names. - name: &'t str, -} - -impl<'t> NameFuzzer<'t> { - /// Construct a new NameFuzzer for the given corpus. - pub fn new(name: &'t str) -> NameFuzzer<'t> { - let typos = { - let keyboards = vec![ - KeyboardLayout::qwerty(), - KeyboardLayout::qwertz(), - KeyboardLayout::azerty(), - ]; - - let homoglyphs = vec![Homoglyphs::ascii()]; - - get_typos(name, &keyboards, &homoglyphs).iter().fold( - HashMap::new(), - |mut typos: HashMap>, typo| { - typos - .entry(typo.to_str().to_owned()) - .and_modify(|val| val.push(typo.clone())) - .or_insert_with(|| vec![typo.clone()]); - - typos - }, - ) - }; - - NameFuzzer { typos, name } - } - - /// Check the name against the set of known typos for the corpus to generate - /// a list of possible typos. - /// - /// Returns an empty slice if no typos were found. - pub fn fuzz(&self, name: &str) -> &[Typo] { - if self.name == name { - return &[]; - } - - self.typos.get(name).map(AsRef::as_ref).unwrap_or(&[]) - } -} - -#[inline] -fn get_typos(name: &str, keyboards: &[KeyboardLayout], homoglyphs: &[Homoglyphs]) -> Vec { - let mut results = Vec::new(); - - // Get all the kinds of typos. - get_addition_typos(&mut results, name); - get_bitsquatting_typos(&mut results, name); - get_hyphenation_typos(&mut results, name); - get_insertion_typos(&mut results, name, keyboards); - get_omission_typos(&mut results, name); - get_repetition_typos(&mut results, name); - get_replacement_typos(&mut results, name, keyboards); - get_transposition_typos(&mut results, name); - get_vowel_swap_typos(&mut results, name); - get_pluralization_typos(&mut results, name); - get_homoglyph_typos(&mut results, name, homoglyphs); - - // The methods above might generate duplicates. This removes them. - // - // Sorting is done with sort() rather than sort_unstable() to ensure that the - // order of the different kinds of typos is preserved, to make testing easier. - // - // Given that a fuzzer should only be constructed once for a corpus, the cost - // difference of this is expected to be negligible. - results.sort(); - results.dedup(); - - results -} - -#[inline] -fn get_addition_typos(results: &mut Vec, name: &str) { - results.extend( - (b'_'..b'z') - .map(char::from) - .map(|c| format!("{}{}", name, c)) - .filter(|t| t != name) - .map(Typo::addition), - ); -} - -#[inline] -fn get_bitsquatting_typos(results: &mut Vec, name: &str) { - results.extend( - [1, 2, 4, 8, 16, 32, 64, 128] - .iter() - .flat_map(|mask| { - name.bytes().enumerate().map(move |(index, byte)| { - let c = mask ^ byte; - - // If the corrupted byte is within the proper ASCII range, then - // produce a new string including the corrupted byte. - if (c == b'-') || (c == b'_') || c.is_ascii_digit() || c.is_ascii_lowercase() { - let mut corrupted = name.to_owned(); - - // We have already ensured the new byte is a valid ASCII byte, so this - // use of unsafe is alright. - let corrupted_bytes = unsafe { corrupted.as_bytes_mut() }; - corrupted_bytes[index] = c; - - Some(corrupted) - } else { - None - } - }) - }) - .flatten() - .filter(|t| t != name) - .map(Typo::bitsquatting), - ); -} - -#[inline] -fn get_hyphenation_typos(results: &mut Vec, name: &str) { - results.extend( - name.chars() - .enumerate() - .map(|(index, _)| { - let mut corrupted = name.to_owned(); - corrupted.insert(index, '-'); - corrupted - }) - .filter(|t| t != name) - .map(Typo::hyphenation), - ); -} - -#[inline] -fn get_insertion_typos(results: &mut Vec, name: &str, keyboards: &[KeyboardLayout]) { - results.extend( - keyboards - .iter() - .flat_map(|keyboard| { - name.chars().enumerate().flat_map(move |(index, c)| { - let mut corruptions = Vec::new(); - - if keyboard.neighbors().contains_key(&c) { - for neighbor in &keyboard.neighbors()[&c] { - // Before the current character. - let mut corrupted_before = name.to_owned(); - corrupted_before.insert(index, *neighbor); - corruptions.push(corrupted_before); - - // After the current character. - let mut corrupted_after = name.to_owned(); - corrupted_after.insert(index + 1, *neighbor); - corruptions.push(corrupted_after); - } - } - - corruptions - }) - }) - .filter(|t| t != name) - .map(Typo::insertion), - ); -} - -#[inline] -fn get_omission_typos(results: &mut Vec, name: &str) { - results.extend( - name.chars() - .enumerate() - .map(|(index, _)| { - let mut corrupted = name.to_owned(); - corrupted.remove(index); - corrupted - }) - .filter(|t| t != name) - .map(Typo::omission), - ); -} - -#[inline] -fn get_repetition_typos(results: &mut Vec, name: &str) { - results.extend( - name.chars() - .enumerate() - .map(|(index, c)| { - let mut corrupted = name.to_owned(); - corrupted.insert(index, c); - corrupted - }) - .filter(|t| t != name) - .map(Typo::repetition), - ); -} - -#[inline] -fn get_replacement_typos(results: &mut Vec, name: &str, keyboards: &[KeyboardLayout]) { - results.extend( - keyboards - .iter() - .flat_map(|keyboard| { - name.chars().enumerate().flat_map(move |(index, c)| { - let mut corruptions = Vec::new(); - - if keyboard.neighbors().contains_key(&c) { - for neighbor in &keyboard.neighbors()[&c] { - let mut corrupted = name.to_owned(); - corrupted.replace_range(index..=index, &neighbor.to_string()); - corruptions.push(corrupted); - } - } - - corruptions - }) - }) - .filter(|t| t != name) - .map(Typo::replacement), - ); -} - -#[inline] -fn get_transposition_typos(results: &mut Vec, name: &str) { - results.extend({ - // Credit for this code to shepmaster on Stack Overflow. - // - // https://codereview.stackexchange.com/questions/155294/transposing-characters-in-a-string - let bytes = name.as_bytes(); - - (1..bytes.len()) - .map(move |i| { - let mut transpose = bytes.to_owned(); - transpose.swap(i - 1, i); - String::from_utf8(transpose).expect("Invalid UTF-8") - }) - .filter(|t| t != name) - .map(Typo::transposition) - }); -} - -#[inline] -fn get_vowel_swap_typos(results: &mut Vec, name: &str) { - results.extend( - name.chars() - .enumerate() - .flat_map(|(index, c)| { - let vowels = ['a', 'e', 'i', 'o', 'u']; - let mut corruptions = Vec::new(); - - for vowel in &vowels { - if vowels.contains(&c) { - let mut corrupted = name.to_owned(); - corrupted.replace_range(index..=index, &vowel.to_string()); - corruptions.push(corrupted); - } - } - - corruptions - }) - .filter(|t| t != name) - .map(Typo::vowel_swap), - ); -} - -#[inline] -fn get_pluralization_typos(results: &mut Vec, name: &str) { - results.extend( - name.chars() - .enumerate() - .map(|(index, _c)| { - let mut corrupted = name.to_owned(); - corrupted.insert(index + 1, 's'); - corrupted - }) - .filter(|t| t != name) - .map(Typo::pluralization), - ); -} - -#[inline] -fn get_homoglyph_typos(results: &mut Vec, name: &str, homoglyphs: &[Homoglyphs]) { - results.extend( - homoglyphs - .iter() - .flat_map(|homoglph| { - name.chars().enumerate().flat_map(move |(index, c)| { - let mut corruptions = Vec::new(); - - if homoglph.glyphs().contains_key(&c) { - for glyph in &homoglph.glyphs()[&c] { - let mut corrupted = name.to_owned(); - corrupted.replace_range(index..=index, &glyph.to_string()); - corruptions.push(corrupted); - } - } - - corruptions - }) - }) - .filter(|t| t != name) - .map(Typo::homoglyph), - ); -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize)] -pub struct Typo { - kind: TypoKind, - typo: String, -} - -impl Typo { - #[inline] - pub fn addition(typo: String) -> Typo { - Typo { - kind: TypoKind::Addition, - typo, - } - } - - #[inline] - pub fn bitsquatting(typo: String) -> Typo { - Typo { - kind: TypoKind::Bitsquatting, - typo, - } - } - - #[inline] - pub fn hyphenation(typo: String) -> Typo { - Typo { - kind: TypoKind::Hyphenation, - typo, - } - } - - #[inline] - pub fn insertion(typo: String) -> Typo { - Typo { - kind: TypoKind::Insertion, - typo, - } - } - - #[inline] - pub fn omission(typo: String) -> Typo { - Typo { - kind: TypoKind::Omission, - typo, - } - } - - #[inline] - pub fn repetition(typo: String) -> Typo { - Typo { - kind: TypoKind::Repetition, - typo, - } - } - - #[inline] - pub fn replacement(typo: String) -> Typo { - Typo { - kind: TypoKind::Replacement, - typo, - } - } - - #[inline] - pub fn transposition(typo: String) -> Typo { - Typo { - kind: TypoKind::Transposition, - typo, - } - } - - #[inline] - pub fn vowel_swap(typo: String) -> Typo { - Typo { - kind: TypoKind::VowelSwap, - typo, - } - } - - #[inline] - pub fn pluralization(typo: String) -> Typo { - Typo { - kind: TypoKind::Pluralization, - typo, - } - } - - #[inline] - pub fn homoglyph(typo: String) -> Typo { - Typo { - kind: TypoKind::Homoglyph, - typo, - } - } - - #[inline] - pub fn to_str(&self) -> &str { - &self.typo - } -} - -impl PartialOrd for Typo { - #[inline] - fn partial_cmp(&self, other: &Typo) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Typo { - #[inline] - fn cmp(&self, other: &Typo) -> Ordering { - self.typo.cmp(&other.typo) - } -} - -impl PartialEq<&Typo> for Typo { - #[inline] - fn eq(&self, other: &&Typo) -> bool { - self.eq(*other) - } -} - -impl PartialEq for &Typo { - #[inline] - fn eq(&self, other: &Typo) -> bool { - (*self).eq(other) - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize)] -enum TypoKind { - Addition, - Bitsquatting, - Hyphenation, - Insertion, - Omission, - Repetition, - Replacement, - Transposition, - VowelSwap, - Pluralization, - Homoglyph, -} - -impl Display for TypoKind { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - TypoKind::Addition => write!(f, "addition"), - TypoKind::Bitsquatting => write!(f, "bitsquatting"), - TypoKind::Hyphenation => write!(f, "hyphenation"), - TypoKind::Insertion => write!(f, "insertion"), - TypoKind::Omission => write!(f, "omission"), - TypoKind::Repetition => write!(f, "repetition"), - TypoKind::Replacement => write!(f, "replacement"), - TypoKind::Transposition => write!(f, "transposition"), - TypoKind::VowelSwap => write!(f, "vowel swap"), - TypoKind::Pluralization => write!(f, "pluralization"), - TypoKind::Homoglyph => write!(f, "homoglyph"), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Homoglyphs(HashMap>); - -impl Homoglyphs { - #[inline] - pub fn new(homoglyphs: HashMap>) -> Homoglyphs { - Homoglyphs(homoglyphs) - } - - #[inline] - pub fn ascii() -> Homoglyphs { - Homoglyphs::new(hashmap! { - 'O' => vec!['0'], - '0' => vec!['O'], - 'l' => vec!['I'], - 'I' => vec!['l'], - }) - } - - #[inline] - pub fn glyphs(&self) -> &HashMap> { - &self.0 - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct KeyboardLayout { - neighbors: HashMap>, -} - -impl KeyboardLayout { - #[inline] - pub fn new(neighbors: HashMap>) -> KeyboardLayout { - KeyboardLayout { neighbors } - } - - #[inline] - pub fn qwerty() -> KeyboardLayout { - KeyboardLayout::new(hashmap! { - '1' => vec!['2', 'q'], - '2' => vec!['3', 'w', 'q', '1'], - '3' => vec!['4', 'e', 'w', '2'], - '4' => vec!['5', 'r', 'e', '3'], - '5' => vec!['6', 't', 'r', '4'], - '6' => vec!['7', 'y', 't', '5'], - '7' => vec!['8', 'u', 'y', '6'], - '8' => vec!['9', 'i', 'u', '7'], - '9' => vec!['0', 'o', 'i', '8'], - '0' => vec!['p', 'o', '9'], - 'q' => vec!['1', '2', 'w', 'a'], - 'w' => vec!['3', 'e', 's', 'a', 'q', '2'], - 'e' => vec!['4', 'r', 'd', 's', 'w', '3'], - 'r' => vec!['5', 't', 'f', 'd', 'e', '4'], - 't' => vec!['6', 'y', 'g', 'f', 'r', '5'], - 'y' => vec!['7', 'u', 'h', 'g', 't', '6'], - 'u' => vec!['8', 'i', 'j', 'h', 'y', '7'], - 'i' => vec!['9', 'o', 'k', 'j', 'u', '8'], - 'o' => vec!['0', 'p', 'l', 'k', 'i', '9'], - 'p' => vec!['l', 'o', '0'], - 'a' => vec!['q', 'w', 's', 'z'], - 's' => vec!['e', 'd', 'x', 'z', 'a', 'w'], - 'd' => vec!['r', 'f', 'c', 'x', 's', 'e'], - 'f' => vec!['t', 'g', 'v', 'c', 'd', 'r'], - 'g' => vec!['y', 'h', 'b', 'v', 'f', 't'], - 'h' => vec!['u', 'j', 'n', 'b', 'g', 'y'], - 'j' => vec!['i', 'k', 'm', 'n', 'h', 'u'], - 'k' => vec!['o', 'l', 'm', 'j', 'i'], - 'l' => vec!['k', 'o', 'p'], - 'z' => vec!['a', 's', 'x'], - 'x' => vec!['z', 's', 'd', 'c'], - 'c' => vec!['x', 'd', 'f', 'v'], - 'v' => vec!['c', 'f', 'g', 'b'], - 'b' => vec!['v', 'g', 'h', 'n'], - 'n' => vec!['b', 'h', 'j', 'm'], - 'm' => vec!['n', 'j', 'k'], - }) - } - - #[inline] - pub fn qwertz() -> KeyboardLayout { - KeyboardLayout::new(hashmap! { - '1' => vec!['2', 'q'], - '2' => vec!['3', 'w', 'q', '1'], - '3' => vec!['4', 'e', 'w', '2'], - '4' => vec!['5', 'r', 'e', '3'], - '5' => vec!['6', 't', 'r', '4'], - '6' => vec!['7', 'z', 't', '5'], - '7' => vec!['8', 'u', 'z', '6'], - '8' => vec!['9', 'i', 'u', '7'], - '9' => vec!['0', 'o', 'i', '8'], - '0' => vec!['p', 'o', '9'], - 'q' => vec!['1', '2', 'w', 'a'], - 'w' => vec!['3', 'e', 's', 'a', 'q', '2'], - 'e' => vec!['4', 'r', 'd', 's', 'w', '3'], - 'r' => vec!['5', 't', 'f', 'd', 'e', '4'], - 't' => vec!['6', 'z', 'g', 'f', 'r', '5'], - 'z' => vec!['7', 'u', 'h', 'g', 't', '6'], - 'u' => vec!['8', 'i', 'j', 'h', 'z', '7'], - 'i' => vec!['9', 'o', 'k', 'j', 'u', '8'], - 'o' => vec!['0', 'p', 'l', 'k', 'i', '9'], - 'p' => vec!['l', 'o', '0'], - 'a' => vec!['q', 'w', 's', 'y'], - 's' => vec!['e', 'd', 'x', 'y', 'a', 'w'], - 'd' => vec!['r', 'f', 'c', 'x', 's', 'e'], - 'f' => vec!['t', 'g', 'v', 'c', 'd', 'r'], - 'g' => vec!['z', 'h', 'b', 'v', 'f', 't'], - 'h' => vec!['u', 'j', 'n', 'b', 'g', 'z'], - 'j' => vec!['i', 'k', 'm', 'n', 'h', 'u'], - 'k' => vec!['o', 'l', 'm', 'j', 'i'], - 'l' => vec!['k', 'o', 'p'], - 'y' => vec!['a', 's', 'x'], - 'x' => vec!['y', 's', 'd', 'c'], - 'c' => vec!['x', 'd', 'f', 'v'], - 'v' => vec!['c', 'f', 'g', 'b'], - 'b' => vec!['v', 'g', 'h', 'n'], - 'n' => vec!['b', 'h', 'j', 'm'], - 'm' => vec!['n', 'j', 'k'], - }) - } - - #[inline] - pub fn azerty() -> KeyboardLayout { - KeyboardLayout::new(hashmap! { - '1' => vec!['2', 'a'], - '2' => vec!['3', 'z', 'a', '1'], - '3' => vec!['4', 'e', 'z', '2'], - '4' => vec!['5', 'r', 'e', '3'], - '5' => vec!['6', 't', 'r', '4'], - '6' => vec!['7', 'y', 't', '5'], - '7' => vec!['8', 'u', 'y', '6'], - '8' => vec!['9', 'i', 'u', '7'], - '9' => vec!['0', 'o', 'i', '8'], - '0' => vec!['p', 'o', '9'], - 'a' => vec!['2', 'z', 'q', '1'], - 'z' => vec!['3', 'e', 's', 'q', 'a', '2'], - 'e' => vec!['4', 'r', 'd', 's', 'z', '3'], - 'r' => vec!['5', 't', 'f', 'd', 'e', '4'], - 't' => vec!['6', 'y', 'g', 'f', 'r', '5'], - 'y' => vec!['7', 'u', 'h', 'g', 't', '6'], - 'u' => vec!['8', 'i', 'j', 'h', 'y', '7'], - 'i' => vec!['9', 'o', 'k', 'j', 'u', '8'], - 'o' => vec!['0', 'p', 'l', 'k', 'i', '9'], - 'p' => vec!['l', 'o', '0', 'm'], - 'q' => vec!['z', 's', 'w', 'a'], - 's' => vec!['e', 'd', 'x', 'w', 'q', 'z'], - 'd' => vec!['r', 'f', 'c', 'x', 's', 'e'], - 'f' => vec!['t', 'g', 'v', 'c', 'd', 'r'], - 'g' => vec!['y', 'h', 'b', 'v', 'f', 't'], - 'h' => vec!['u', 'j', 'n', 'b', 'g', 'y'], - 'j' => vec!['i', 'k', 'n', 'h', 'u'], - 'k' => vec!['o', 'l', 'j', 'i'], - 'l' => vec!['k', 'o', 'p', 'm'], - 'm' => vec!['l', 'p'], - 'w' => vec!['s', 'x', 'q'], - 'x' => vec!['w', 's', 'd', 'c'], - 'c' => vec!['x', 'd', 'f', 'v'], - 'v' => vec!['c', 'f', 'g', 'b'], - 'b' => vec!['v', 'g', 'h', 'n'], - 'n' => vec!['b', 'h', 'j'], - }) - } - - #[inline] - pub fn neighbors(&self) -> &HashMap> { - &self.neighbors - } -} - -#[cfg(test)] -mod test { - use super::{NameFuzzer, Typo}; - - macro_rules! test_typos { - ( from: $name:ident, to: $to:literal, expected: [ $( $expected:ident ),* ] ) => { - let fuzzer = NameFuzzer::new(&$name); - let result = fuzzer.fuzz($to); - - let expected = vec![ $( - Typo::$expected($to.into()), - )* ]; - - assert_eq!(result, &expected[..]); - }; - } - - const NAME: &str = "hello"; - - #[test] - fn fuzz_hello_to_hallo() { - test_typos! { from: NAME, to: "hallo", expected: [bitsquatting, vowel_swap] } - } - - #[test] - fn fuzz_hello_to_helo() { - test_typos! { from: NAME, to: "helo", expected: [omission] } - } - - #[test] - fn fuzz_hello_to_helllo() { - test_typos! { from: NAME, to: "helllo", expected: [insertion, repetition] } - } - - #[test] - fn fuzz_hello_to_hrllo() { - test_typos! { from: NAME, to: "hrllo", expected: [replacement] } - } - - #[test] - fn fuzz_hello_to_hlelo() { - test_typos! { from: NAME, to: "hlelo", expected: [transposition] } - } - - #[test] - fn fuzz_hello_to_hellop() { - test_typos! { from: NAME, to: "hellop", expected: [addition, insertion] } - } - - #[test] - fn fuzz_hello_to_h_ello() { - test_typos! { from: NAME, to: "h-ello", expected: [hyphenation] } - } - - #[test] - fn fuzz_hello_to_hellos() { - test_typos! { from: NAME, to: "hellos", expected: [addition, pluralization] } - } -} diff --git a/hipcheck/src/util/iter.rs b/hipcheck/src/util/iter.rs index beb3e6fb..ac9d27b1 100644 --- a/hipcheck/src/util/iter.rs +++ b/hipcheck/src/util/iter.rs @@ -2,80 +2,6 @@ //! Iterator extension traits. -/// A fallible analogue of the `Iterator::any` method -pub trait TryAny: Iterator { - fn try_any(&mut self, mut f: F) -> Result - where - F: FnMut(::Item) -> Result, - { - for t in self { - match f(t) { - Ok(false) => continue, - result => return result, - } - } - - Ok(false) - } -} - -impl TryAny for I {} - -/// Represents an iterator and a fallible criterion for filtering it -pub struct FallibleFilter -where - I: Iterator, - P: FnMut(&::Item) -> Result, -{ - iterator: I, - predicate: P, -} - -impl FallibleFilter -where - I: Iterator, - P: FnMut(&::Item) -> Result, -{ - fn new(iterator: I, predicate: P) -> Self { - FallibleFilter { - iterator, - predicate, - } - } -} - -impl Iterator for FallibleFilter -where - I: Iterator, - P: FnMut(&::Item) -> Result, -{ - type Item = Result<::Item, E>; - - fn next(&mut self) -> Option { - if let Some(t) = self.iterator.next() { - match (self.predicate)(&t) { - Ok(true) => Some(Ok(t)), - Ok(false) => self.next(), - Err(e) => Some(Err(e)), - } - } else { - None - } - } -} - -/// Apply a fallible filter to an Iterator, returning the elements -/// selected by the filter and any errors that occur. -pub trait TryFilter: Sized + Iterator { - fn try_filter(self, predicate: P) -> FallibleFilter - where - P: FnMut(&::Item) -> Result, - { - FallibleFilter::new(self, predicate) - } -} - -impl TryFilter for I {} #[cfg(test)] mod tests {