diff --git a/ngrams/Cargo.toml b/ngrams/Cargo.toml index 9666ca4..4a38935 100644 --- a/ngrams/Cargo.toml +++ b/ngrams/Cargo.toml @@ -23,6 +23,7 @@ pretty_assertions = "1.4.0" serde = { version = "1.0.203", features = ["derive"] } ciborium = "0.2" lazy_static = "1.4.0" +derivative = "^2.2" [dependencies.seqraph] diff --git a/ngrams/src/graph/labelling/frequency.rs b/ngrams/src/graph/labelling/frequency.rs index 387fc09..5ec1f9f 100644 --- a/ngrams/src/graph/labelling/frequency.rs +++ b/ngrams/src/graph/labelling/frequency.rs @@ -16,7 +16,7 @@ use crate::graph::{ TopDown, TraversalDirection, }, pass::TraversalPass, queue::{Queue, SortedQueue} - }, utils::cover::FrequencyCover, vocabulary::{ + }, utils::cover::frequency::FrequencyCover, vocabulary::{ entry::{ HasVertexEntries, VertexCtx, @@ -45,12 +45,10 @@ pub struct FrequencyCtx<'b> #[deref] #[deref_mut] pub ctx: &'b mut LabellingCtx, - #[new(default)] - visited: ::Visited, } + impl TraversalPass for FrequencyCtx<'_> { - type Visited = (); type Node = VertexKey; type NextNode = NGramId; type Queue = SortedQueue; @@ -67,9 +65,6 @@ impl TraversalPass for FrequencyCtx<'_> self.labels.extend(start.iter().map(HasVertexKey::vertex_key)); queue } - fn visited(&mut self) -> &mut Self::Visited { - &mut self.visited - } fn on_node( &mut self, node: &Self::Node, diff --git a/ngrams/src/graph/labelling/wrapper.rs b/ngrams/src/graph/labelling/wrapper.rs index 3614098..eae50a1 100644 --- a/ngrams/src/graph/labelling/wrapper.rs +++ b/ngrams/src/graph/labelling/wrapper.rs @@ -19,8 +19,8 @@ use crate::graph::{ TopDown, TraversalDirection, }, - pass::TraversalPass, queue::{LayeredQueue, Queue}, - }, utils::tree::ChildTree, vocabulary::{ + pass::TraversalPass, queue::{LayeredQueue, Queue}, visited::Visited, + }, utils::cover::ChildCover, vocabulary::{ entry::VertexCtx, NGramId, ProcessStatus, Vocabulary, @@ -44,22 +44,25 @@ pub struct WrapperCtx<'b> #[deref_mut] ctx: &'b mut LabellingCtx, #[new(default)] - visited: ::Visited, + visited: ::Collection, } // - run bottom up (all smaller nodes need to be fully labelled) // - for each node x: // - run top down to find the largest frequent children to cover whole range // - label node x if there are multiple overlapping labelled child nodes +impl Visited for WrapperCtx<'_> +{ + type Collection = HashSet<::Node>; + fn visited<'t>(&'t mut self) -> &'t mut ::Collection { + &mut self.visited + } +} impl TraversalPass for WrapperCtx<'_> { - type Visited = HashSet; type Node = VertexKey; type NextNode = VertexKey; type Queue = LayeredQueue; - fn visited(&mut self) -> &mut Self::Visited { - &mut self.visited - } fn start_queue(&mut self) -> Self::Queue { BottomUp::starting_nodes(&self.vocab).into_iter() .map(|ng| ng.key).collect() @@ -77,7 +80,7 @@ impl TraversalPass for WrapperCtx<'_> if !self.labels.contains(node) { - let tree = ChildTree::from_entry(self.ctx, &entry); + let tree = ChildCover::from_key(self.ctx, entry.vertex_key()); if tree.any_intersect() { let key = entry.data.vertex_key(); diff --git a/ngrams/src/graph/partitions/container/mod.rs b/ngrams/src/graph/partitions/container/mod.rs index 31b32c2..21991e9 100644 --- a/ngrams/src/graph/partitions/container/mod.rs +++ b/ngrams/src/graph/partitions/container/mod.rs @@ -54,7 +54,7 @@ use crate::graph::{ }, traversal::direction::{ TopDown, TraversalDirection, - }, utils::tree::ChildTree, vocabulary::{ + }, utils::cover::ChildCover, vocabulary::{ entry::{ HasVertexEntries, VertexCtx, @@ -89,15 +89,16 @@ pub struct PartitionContainer } impl PartitionContainer { - pub fn from_entry( + pub fn from_ngram( ctx: &PartitionsCtx<'_>, - entry: &VertexCtx, + ngram: NGramId, ) -> Self { // find all largest children - let tree = ChildTree::from_entry(ctx, entry); + let tree = ChildCover::from_key(ctx, ngram.vertex_key()); + assert!( - match entry.width() + match ngram.width() { 1 => tree.is_empty(), _ => !tree.is_empty(), @@ -107,7 +108,7 @@ impl PartitionContainer // build container with gaps //let next = tree.iter().map(|(_, c)| c.vertex_index()).collect(); let ctx = NodePartitionCtx::new( - NGramId::new(entry.data.vertex_key(), entry.data.width()), + ngram, ctx, ); Self::from_child_list(&ctx, tree) diff --git a/ngrams/src/graph/partitions/mod.rs b/ngrams/src/graph/partitions/mod.rs index f8c554d..48811e8 100644 --- a/ngrams/src/graph/partitions/mod.rs +++ b/ngrams/src/graph/partitions/mod.rs @@ -19,7 +19,7 @@ use crate::graph::{ }, pass::TraversalPass, queue::Queue, }, - utils::tree::ChildTree, + utils::cover::ChildCover, vocabulary::{ entry::{ HasVertexEntries, @@ -54,7 +54,7 @@ use seqraph::{ HashSet, }; -use super::{traversal::queue::LayeredQueue, vocabulary::Vocabulary}; +use super::{traversal::{queue::LayeredQueue, visited::Visited}, vocabulary::Vocabulary}; // - run top down (smaller nodes to label need to be found) // - for each node x: @@ -78,7 +78,7 @@ pub struct PartitionsCtx<'b> #[deref_mut] pub ctx: &'b mut LabellingCtx, pub graph: Hypergraph, - visited: ::Visited, + visited: ::Collection, } impl<'b> From<&'b mut LabellingCtx> for PartitionsCtx<'b> { @@ -91,15 +91,18 @@ impl<'b> From<&'b mut LabellingCtx> for PartitionsCtx<'b> { } } } +impl Visited for PartitionsCtx<'_> +{ + type Collection = HashSet<::Node>; + fn visited<'t>(&'t mut self) -> &'t mut ::Collection { + &mut self.visited + } +} impl TraversalPass for PartitionsCtx<'_> { - type Visited = HashSet; type Node = NGramId; type NextNode = NGramId; type Queue = LayeredQueue; - fn visited(&mut self) -> &mut Self::Visited { - &mut self.visited - } fn start_queue(&mut self) -> Self::Queue { let queue = Self::Queue::from_iter( TopDown::starting_nodes(&self.vocab) @@ -119,8 +122,8 @@ impl TraversalPass for PartitionsCtx<'_> node: &NGramId, ) -> Option> { + let container = PartitionContainer::from_ngram(self, *node); let entry = self.vocab.get_vertex(node).unwrap(); - let container = PartitionContainer::from_entry(self, &entry); let pids: Vec<_> = std::iter::repeat_n((), container.len()) .map(|_| PatternId::default()) diff --git a/ngrams/src/graph/traversal/pass.rs b/ngrams/src/graph/traversal/pass.rs index 7d8845e..f4e1780 100644 --- a/ngrams/src/graph/traversal/pass.rs +++ b/ngrams/src/graph/traversal/pass.rs @@ -18,20 +18,18 @@ use seqraph::graph::vertex::{ VertexIndex, }; -use super::{queue::Queue, visited::Visited}; +use super::{queue::Queue, visited::{Visited, VisitorCollection}}; pub trait PassNode: Eq + PartialEq + Debug + Clone + Hash {} impl PassNode for N {} pub trait TraversalPass : Sized { type Node: PassNode + Copy; type NextNode: PassNode + Into; - type Visited: Visited; type Queue: Queue; - fn visited(&mut self) -> &mut Self::Visited; fn start_queue(&mut self) -> Self::Queue; fn on_node(&mut self, node: &Self::Node) -> Option>; fn node_condition(&mut self, node: Self::Node) -> bool { - self.visited().insert(node) + true } fn begin_run(&mut self) {} fn finish_run(&mut self) {} @@ -56,4 +54,3 @@ pub trait TraversalPass : Sized { self.finish_run() } } - diff --git a/ngrams/src/graph/traversal/queue.rs b/ngrams/src/graph/traversal/queue.rs index c408e59..e72a888 100644 --- a/ngrams/src/graph/traversal/queue.rs +++ b/ngrams/src/graph/traversal/queue.rs @@ -2,7 +2,7 @@ use std::{collections::VecDeque, ops::{Deref, DerefMut}}; use itertools::Itertools; -use crate::graph::{utils::cover::FrequencyCover, vocabulary::{ +use crate::graph::{utils::cover::frequency::FrequencyCover, vocabulary::{ entry::VertexCtx, NGramId, Vocabulary, diff --git a/ngrams/src/graph/traversal/visited.rs b/ngrams/src/graph/traversal/visited.rs index 17e0028..97c3056 100644 --- a/ngrams/src/graph/traversal/visited.rs +++ b/ngrams/src/graph/traversal/visited.rs @@ -20,17 +20,33 @@ use seqraph::{ HashSet, }; -pub trait Visited { - fn insert(&mut self, node:

::Node) -> bool; +use super::pass::PassNode; + +pub trait Visited: TraversalPass { + type Collection: VisitorCollection; + fn visited(&mut self) -> &mut Self::Collection; +} +pub trait VisitorCollection { + type Ref<'t>: VisitorCollection where N: 't; + fn insert(&mut self, node: N) -> bool; } -impl Visited

for HashSet { - fn insert(&mut self, node:

::Node) -> bool { - HashSet::insert(self, node) +impl VisitorCollection for HashSet +{ + type Ref<'t> = &'t mut Self where N: 't; + fn insert(&mut self, node: N) -> bool { + <&mut Self as VisitorCollection>::insert(&mut &mut *self, node) + } +} +impl<'a, N: PassNode> VisitorCollection for &'a mut HashSet { + type Ref<'t> = &'t mut HashSet where N: 't; + fn insert(&mut self, node: N) -> bool { + HashSet::insert(*self, node) } } -impl Visited

for () { - fn insert(&mut self, node:

::Node) -> bool { +impl VisitorCollection for () { + type Ref<'t> = Self where N: 't; + fn insert(&mut self, node: N) -> bool { true } } \ No newline at end of file diff --git a/ngrams/src/graph/utils/tree/child_cover.rs b/ngrams/src/graph/utils/cover/child.rs similarity index 71% rename from ngrams/src/graph/utils/tree/child_cover.rs rename to ngrams/src/graph/utils/cover/child.rs index 1b7bb8e..b9e18ef 100644 --- a/ngrams/src/graph/utils/tree/child_cover.rs +++ b/ngrams/src/graph/utils/cover/child.rs @@ -5,18 +5,13 @@ use seqraph::{ graph::{ getters::vertex::VertexSet, vertex::{ - child::Child, - data::{ + child::Child, data::{ VertexData, VertexDataBuilder, - }, - has_vertex_index::{ + }, has_vertex_index::{ HasVertexIndex, ToChild, - }, - has_vertex_key::HasVertexKey, - wide::Wide, - VertexIndex, + }, has_vertex_key::HasVertexKey, key::VertexKey, wide::Wide, VertexIndex }, Hypergraph, }, @@ -55,7 +50,7 @@ use crate::graph::{ TopDown, TraversalDirection, }, - pass::TraversalPass, queue::{LayeredQueue, Queue}, + pass::TraversalPass, queue::{LayeredQueue, Queue}, visited::Visited, }, vocabulary::{ entry::{ @@ -68,44 +63,47 @@ use crate::graph::{ }, }; -use super::ChildTree; -#[derive(Debug, new)] +use super::ChildCover; + +#[derive(Debug)] pub struct ChildCoverPass<'a> { pub ctx: &'a LabellingCtx, - pub root: &'a VertexCtx<'a>, - #[new(default)] - pub visited: ::Visited, - #[new(default)] - pub tree: ChildTree, + pub root: VertexKey, + pub cover: ChildCover, +} +impl<'a> ChildCoverPass<'a> { + pub fn new(ctx: &'a LabellingCtx, root: VertexKey) -> Self { + Self { + ctx, + root, + cover: Default::default(), + } + } } impl TraversalPass for ChildCoverPass<'_> { - type Visited = HashSet; type Node = (usize, NGramId); type NextNode = (usize, NGramId); type Queue = LayeredQueue; - fn visited(&mut self) -> &mut Self::Visited { - &mut self.visited - } fn start_queue(&mut self) -> Self::Queue { Self::Queue::from_iter( - TopDown::next_nodes(self.root) + TopDown::next_nodes(&self.ctx.vocab.expect_vertex(&self.root)) ) } fn on_node(&mut self, node: &Self::Node) -> Option> { let &(off, node) = node; // check if covered - if self.tree.any_covers(off, node) + if self.cover.any_covers(off, node) { None } else if self.ctx.labels.contains(&node) { - self.tree.insert(off, node); + self.cover.insert(off, node); None } else { - let ne = self.root.vocab.get_vertex(&node).unwrap(); + let ne = self.ctx.vocab.get_vertex(&node).unwrap(); Some( TopDown::next_nodes(&ne) .into_iter() diff --git a/ngrams/src/graph/utils/cover.rs b/ngrams/src/graph/utils/cover/frequency.rs similarity index 100% rename from ngrams/src/graph/utils/cover.rs rename to ngrams/src/graph/utils/cover/frequency.rs diff --git a/ngrams/src/graph/utils/tree/mod.rs b/ngrams/src/graph/utils/cover/mod.rs similarity index 80% rename from ngrams/src/graph/utils/tree/mod.rs rename to ngrams/src/graph/utils/cover/mod.rs index 488fca6..337b63b 100644 --- a/ngrams/src/graph/utils/tree/mod.rs +++ b/ngrams/src/graph/utils/cover/mod.rs @@ -1,26 +1,23 @@ -pub mod child_cover; -pub mod child_dedup; +pub mod child; +pub mod parent; +pub mod frequency; -use child_cover::ChildCoverPass; +use child::ChildCoverPass; use itertools::Itertools; use pretty_assertions::assert_matches; use range_ext::intersect::Intersect; +use derivative::Derivative; use seqraph::{ graph::{ getters::vertex::VertexSet, vertex::{ - child::Child, - data::{ + child::Child, data::{ VertexData, VertexDataBuilder, - }, - has_vertex_index::{ + }, has_vertex_index::{ HasVertexIndex, ToChild, - }, - has_vertex_key::HasVertexKey, - wide::Wide, - VertexIndex, + }, has_vertex_key::HasVertexKey, key::VertexKey, wide::Wide, VertexIndex }, Hypergraph, }, @@ -72,25 +69,25 @@ use crate::graph::{ }, }; -#[derive(Debug, Deref, DerefMut, Default, IntoIterator)] -pub struct ChildTree +#[derive(Debug, Deref, DerefMut, Default, IntoIterator, new)] +pub struct ChildCover { + #[into_iterator(owned, ref)] #[deref] #[deref_mut] - #[into_iterator(owned, ref)] entries: HashMap, } -impl ChildTree +impl ChildCover { // find largest labelled children - pub fn from_entry( + pub fn from_key( ctx: &LabellingCtx, - entry: &VertexCtx<'_>, + key: VertexKey, ) -> Self { - let mut ctx = ChildCoverPass::new(ctx, entry); + let mut ctx = ChildCoverPass::new(ctx, key); ctx.run(); - ctx.tree + ctx.cover } pub fn as_ranges(&self) -> HashSet> { diff --git a/ngrams/src/graph/utils/cover/parent.rs b/ngrams/src/graph/utils/cover/parent.rs new file mode 100644 index 0000000..cbc7ade --- /dev/null +++ b/ngrams/src/graph/utils/cover/parent.rs @@ -0,0 +1,136 @@ +use itertools::Itertools; +use pretty_assertions::assert_matches; +use range_ext::intersect::Intersect; +use seqraph::{ + graph::{ + getters::vertex::VertexSet, + vertex::{ + child::Child, data::{ + VertexData, + VertexDataBuilder, + }, has_vertex_index::{ + HasVertexIndex, + ToChild, + }, has_vertex_key::HasVertexKey, key::VertexKey, wide::Wide, VertexIndex + }, + Hypergraph, + }, + HashMap, + HashSet, +}; +use std::{ + cmp::{ + Ordering, + Reverse, + }, + collections::VecDeque, + fmt::{ + Display, + Formatter, + }, + num::NonZeroUsize, + ops::Range, +}; + +use derive_new::new; +use derive_more::{ + Deref, + DerefMut, + IntoIterator, +}; + +use crate::graph::{ + labelling::LabellingCtx, + partitions::{ + NodePartitionCtx, + PartitionsCtx, + }, + traversal::{ + direction::{ + TopDown, + TraversalDirection, + }, + pass::TraversalPass, queue::{LayeredQueue, Queue}, visited::Visited, + }, + vocabulary::{ + entry::{ + HasVertexEntries, + VertexCtx, + VocabEntry, + }, + NGramId, + ProcessStatus, Vocabulary, + }, +}; + +use super::ChildCover; + +#[derive(Debug)] +pub struct ParentCoverPass<'a> { + pub ctx: &'a LabellingCtx, + pub root: VertexKey, + pub cover: ChildCover, +} +impl<'a> ParentCoverPass<'a> { + pub fn new(ctx: &'a LabellingCtx, root: VertexKey) -> Self { + Self { + ctx, + root, + cover: Default::default(), + } + } +} +impl TraversalPass for ParentCoverPass<'_> { + type Node = (usize, NGramId); + type NextNode = (usize, NGramId); + type Queue = LayeredQueue; + fn start_queue(&mut self) -> Self::Queue { + Self::Queue::from_iter( + TopDown::next_nodes(&self.ctx.vocab.expect_vertex(&self.root)) + ) + } + fn on_node(&mut self, node: &Self::Node) -> Option> { + let &(off, node) = node; + // check if covered + if self.cover.any_covers(off, node) + { + None + } + else if self.ctx.labels.contains(&node) + { + self.cover.insert(off, node); + None + } + else + { + let ne = self.ctx.vocab.get_vertex(&node).unwrap(); + Some( + TopDown::next_nodes(&ne) + .into_iter() + .map(|(o, c)| (o + off, c)) + .collect() + ) + } + } + + fn run(&mut self) { + self.begin_run(); + let mut queue = self.start_queue(); + + while !queue.is_empty() + { + while let Some(node) = queue.pop_front() + { + let node = node.into(); + if let Some(next) = self.node_condition(node) + .then(|| self.on_node(&node)) + .flatten() + { + queue.extend_layer(next); + } + } + queue.finish_layer() + } + self.finish_run() + } +} diff --git a/ngrams/src/graph/utils/tree/child_dedup.rs b/ngrams/src/graph/utils/dedup/mod.rs similarity index 57% rename from ngrams/src/graph/utils/tree/child_dedup.rs rename to ngrams/src/graph/utils/dedup/mod.rs index e314792..3bfe682 100644 --- a/ngrams/src/graph/utils/tree/child_dedup.rs +++ b/ngrams/src/graph/utils/dedup/mod.rs @@ -5,18 +5,13 @@ use seqraph::{ graph::{ getters::vertex::VertexSet, vertex::{ - child::Child, - data::{ + child::Child, data::{ VertexData, VertexDataBuilder, - }, - has_vertex_index::{ + }, has_vertex_index::{ HasVertexIndex, ToChild, - }, - has_vertex_key::HasVertexKey, - wide::Wide, - VertexIndex, + }, has_vertex_key::HasVertexKey, key::VertexKey, wide::Wide, VertexIndex }, Hypergraph, }, @@ -55,7 +50,7 @@ use crate::graph::{ TopDown, TraversalDirection, }, - pass::TraversalPass, queue::{LayeredQueue, Queue}, + pass::TraversalPass, queue::{LayeredQueue, Queue}, visited::Visited, }, vocabulary::{ entry::{ @@ -68,49 +63,58 @@ use crate::graph::{ }, }; -use super::ChildTree; -#[derive(Debug, new)] +use super::cover::ChildCover; + +#[derive(Debug)] pub struct ChildDedupPass<'a> { pub ctx: &'a LabellingCtx, - pub root: &'a VertexCtx<'a>, - #[new(default)] - pub tree: ChildTree, - #[new(default)] - pub visited: ::Visited, + pub covers: HashMap, +} + +impl<'a> ChildDedupPass<'a> { + pub fn new(ctx: &'a LabellingCtx, roots: impl IntoIterator) -> Self { + Self { + ctx, + covers: roots.into_iter().map(|root| (root, ChildCover::default())).collect(), + } + } } + impl TraversalPass for ChildDedupPass<'_> { - type Visited = HashSet; - type Node = (usize, NGramId); - type NextNode = (usize, NGramId); + type Node = (VertexKey, usize, NGramId); + type NextNode = (VertexKey, usize, NGramId); type Queue = LayeredQueue; - fn visited(&mut self) -> &mut Self::Visited { - &mut self.visited - } fn start_queue(&mut self) -> Self::Queue { Self::Queue::from_iter( - TopDown::next_nodes(self.root) + //TopDown::next_nodes(&self.ctx.vocab.expect_vertex(&self.root)) + self.covers.iter().flat_map(|(key, tree)| + TopDown::next_nodes(&self.ctx.vocab.expect_vertex(key)) + .into_iter() + .map(|(p, n)| (*key, p, n)) + ) ) } fn on_node(&mut self, node: &Self::Node) -> Option> { - let &(off, node) = node; + let &(root, off, node) = node; + let cover = self.covers.get_mut(&root).unwrap(); // check if covered - if self.tree.any_covers(off, node) + if cover.any_covers(off, node) { None } else if self.ctx.labels.contains(&node) { - self.tree.insert(off, node); + cover.insert(off, node); None } else { - let ne = self.root.vocab.get_vertex(&node).unwrap(); + let ne = self.ctx.vocab.get_vertex(&node).unwrap(); Some( TopDown::next_nodes(&ne) .into_iter() - .map(|(o, c)| (o + off, c)) + .map(|(o, c)| (root, o + off, c)) .collect() ) } diff --git a/ngrams/src/graph/utils/mod.rs b/ngrams/src/graph/utils/mod.rs index 4b15932..934ee87 100644 --- a/ngrams/src/graph/utils/mod.rs +++ b/ngrams/src/graph/utils/mod.rs @@ -1,2 +1,2 @@ pub mod cover; -pub mod tree; \ No newline at end of file +pub mod dedup; \ No newline at end of file diff --git a/ngrams/src/graph/vocabulary/entry.rs b/ngrams/src/graph/vocabulary/entry.rs index f3c6f90..6503459 100644 --- a/ngrams/src/graph/vocabulary/entry.rs +++ b/ngrams/src/graph/vocabulary/entry.rs @@ -31,7 +31,10 @@ use crate::graph::{ Vocabulary, }, }; -use std::borrow::Borrow; +use std::{ + fmt::Debug, + borrow::Borrow, +}; #[derive(Debug, Deref, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct VocabEntry @@ -82,7 +85,7 @@ pub struct VertexCtxMut<'a> } // define how to access a graph // useful if you store extra labels for nodes by which to query -pub trait HasVertexEntries +pub trait HasVertexEntries { fn entry( &mut self, @@ -96,6 +99,20 @@ pub trait HasVertexEntries &mut self, key: &K, ) -> Option; + fn expect_vertex( + &self, + key: &K, + ) -> VertexCtx { + self.get_vertex(key) + .expect(&format!("No VertexKey: {:?}", key)) + } + fn expect_vertex_mut( + &mut self, + key: &K, + ) -> VertexCtxMut { + self.get_vertex_mut(key) + .expect(&format!("No VertexKey: {:?}", key)) + } } pub trait VocabIndex: HasVertexIndex {} //impl VocabIndex for VertexIndex {}