From b2e654b2f60b47086e47772645687149c0ce8c1e Mon Sep 17 00:00:00 2001 From: Chielo Newctle Date: Wed, 18 Oct 2023 10:49:52 +0800 Subject: [PATCH 1/5] chore(readme): update readme --- README.md | 12 ++++++------ src/lib.rs | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index cae909a..0982873 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,10 @@ use general_sam::sam::GeneralSAM; let sam = GeneralSAM::construct_from_bytes("abcbc"); // => GeneralSAM -// "cbc" is a suffix. +// "cbc" is a suffix of "abcbc" assert!(sam.get_root_state().feed_bytes("cbc").is_accepting()); -// "bcb" isn't a suffix. +// "bcb" is not a suffix of "abcbc" assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting()); ``` @@ -57,19 +57,19 @@ let sam = GeneralSAM::construct_from_chars("abcbc".chars()); let state = sam.get_root_state(); -// "b" is not a suffix but a substring. +// "b" is not a suffix but at least a substring of "abcbc" let state = state.feed_chars("b"); assert!(!state.is_accepting()); -// "bc" is a suffix. +// "bc" is a suffix of "abcbc" let state = state.feed_chars("c"); assert!(state.is_accepting()); -// "bcbc" is also a suffix. +// "bcbc" is a suffix of "abcbc" let state = state.feed_chars("bc"); assert!(state.is_accepting()); -// "bcbcbc" is not a substring. +// "bcbcbc" is not a substring, much less a suffix of "abcbc" let state = state.feed_chars("bc"); assert!(!state.is_accepting() && state.is_nil()); ``` diff --git a/src/lib.rs b/src/lib.rs index 8fef252..8f91480 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,28 @@ //! This crate provides an implementation of a general suffix automaton. //! -//! | [![the suffix automaton of abcbc][sam-of-abcbc]][sam-oi-wiki] | -//! | :----------------------------------------------------------------------------: | -//! | The suffix automaton of abcbc, image from [后缀自动机 - OI Wiki][sam-oi-wiki]. | +//! ```mermaid +//! flowchart LR +//! 
init((ε)) +//! a((a)) +//! b((b)) +//! ab((ab)) +//! bc(((bc))) +//! abc((abc)) +//! abcb((abcb)) +//! abcbc(((abcbc))) +//! +//! init -- a --> a +//! init -- b --> b +//! a -- b --> ab +//! b -- c --> bc +//! init -- c --> bc +//! ab -- c --> abc +//! bc -- b --> abcb +//! abc -- b --> abcb +//! abcb -- c --> abcbc +//! ``` //! -//! [sam-of-abcbc]: https://oi-wiki.org/string/images/SAM/SA_suffix_links.svg -//! [sam-oi-wiki]: https://oi-wiki.org/string/sam/ +//! > The suffix automaton of abcbc. //! //! # Examples //! @@ -15,7 +32,10 @@ //! let sam = GeneralSAM::construct_from_bytes("abcbc"); //! // => GeneralSAM //! +//! // "cbc" is a suffix of "abcbc" //! assert!(sam.get_root_state().feed_bytes("cbc").is_accepting()); +//! +//! // "bcb" is not a suffix of "abcbc" //! assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting()); //! ``` //! @@ -26,12 +46,20 @@ //! // => GeneralSAM //! //! let state = sam.get_root_state(); +//! +//! // "b" is not a suffix but at least a substring of "abcbc" //! let state = state.feed_chars("b"); //! assert!(!state.is_accepting()); +//! +//! // "bc" is a suffix of "abcbc" //! let state = state.feed_chars("c"); //! assert!(state.is_accepting()); +//! +//! // "bcbc" is a suffix of "abcbc" //! let state = state.feed_chars("bc"); //! assert!(state.is_accepting()); +//! +//! // "bcbcbc" is not a substring, much less a suffix of "abcbc" //! let state = state.feed_chars("bc"); //! assert!(!state.is_accepting() && state.is_nil()); //! 
``` From 9a57f5f2a4993aa282c7ea79a235a2bff0cee0e8 Mon Sep 17 00:00:00 2001 From: Chielo Newctle Date: Wed, 18 Oct 2023 10:50:24 +0800 Subject: [PATCH 2/5] chore(format): reformat trie.rs --- src/trie.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/trie.rs b/src/trie.rs index d84269e..f1c531a 100644 --- a/src/trie.rs +++ b/src/trie.rs @@ -45,7 +45,10 @@ impl TrieNode { impl Default for Trie { fn default() -> Self { Self { - node_pool: vec![TrieNode::new(TRIE_NIL_NODE_ID), TrieNode::new(TRIE_NIL_NODE_ID)], + node_pool: vec![ + TrieNode::new(TRIE_NIL_NODE_ID), + TrieNode::new(TRIE_NIL_NODE_ID), + ], } } } @@ -86,7 +89,10 @@ impl Trie { node_id } - pub fn insert_ref_iter<'s, Iter: Iterator>(&'s mut self, iter: Iter) -> TrieNodeID { + pub fn insert_ref_iter<'s, Iter: Iterator>( + &'s mut self, + iter: Iter, + ) -> TrieNodeID { self.insert_iter(iter.cloned()) } From 094c2573483200c0b375554e52ee0b9b65ddad40 Mon Sep 17 00:00:00 2001 From: Chielo Newctle Date: Wed, 18 Oct 2023 10:51:32 +0800 Subject: [PATCH 3/5] feat(sam): export `get_node` interface --- src/sam/mod.rs | 8 ++++++++ src/sam/state.rs | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/sam/mod.rs b/src/sam/mod.rs index 30b1ce9..7a20f2e 100644 --- a/src/sam/mod.rs +++ b/src/sam/mod.rs @@ -91,6 +91,14 @@ impl GeneralSAM { self.node_pool.len() } + pub fn get_root_node(&self) -> &GeneralSAMNode { + self.get_node(SAM_ROOT_NODE_ID).unwrap() + } + + pub fn get_node(&self, node_id: GeneralSAMNodeID) -> Option<&GeneralSAMNode> { + self.node_pool.get(node_id) + } + pub fn get_root_state(&self) -> GeneralSAMState { self.get_state(SAM_ROOT_NODE_ID) } diff --git a/src/sam/state.rs b/src/sam/state.rs index 7708613..d9fbaad 100644 --- a/src/sam/state.rs +++ b/src/sam/state.rs @@ -37,8 +37,8 @@ impl<'s, T: Ord + Clone> GeneralSAMState<'s, T> { .unwrap_or(false) } - pub fn get_node(&self) -> Option<&'_ GeneralSAMNode> { - self.sam.node_pool.get(self.node_id) + pub fn 
get_node(&self) -> Option<&GeneralSAMNode> { + self.sam.get_node(self.node_id) } pub fn goto_suffix_parent(&mut self) { From 595ac24e00c6b18f114093e4348ff469a8aa3be1 Mon Sep 17 00:00:00 2001 From: Chielo Newctle Date: Wed, 18 Oct 2023 10:56:50 +0800 Subject: [PATCH 4/5] feat(sam): rename topological sorting related interfaces A topological ordering of states is not necessarily an ordering from suffix parents to their children. So an arbitrary topological ordering cannot be used for updating accepting states. However, the implementation is still correct, because the topological sorting is done with a queue, where **NO** states with longer maximum suffix lengths are visited earlier than states with shorter ones. So the interface names related to topological sorting are changed into more precise ones. --- src/sam/mod.rs | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/sam/mod.rs b/src/sam/mod.rs index 7a20f2e..881879a 100644 --- a/src/sam/mod.rs +++ b/src/sam/mod.rs @@ -23,7 +23,7 @@ pub struct GeneralSAMNode { #[derive(Debug, Clone)] pub struct GeneralSAM { node_pool: Vec>, - topo_order: Vec, + topo_and_suf_len_sorted_order: Vec, } impl GeneralSAMNode { @@ -81,7 +81,7 @@ impl Default for GeneralSAM { GeneralSAMNode::new(false, 0, SAM_NIL_NODE_ID), GeneralSAMNode::new(true, 0, SAM_NIL_NODE_ID), ], - topo_order: Default::default(), + topo_and_suf_len_sorted_order: Default::default(), } } } @@ -114,8 +114,11 @@ impl GeneralSAM { } } - pub fn get_topo_sorted_node_ids(&self) -> &Vec { - &self.topo_order + /// Returns topological sorted, maximum suffix length sorted + /// and suffix parent depth sorted node id sequence, + /// which is generated by topological sorting with a queue. 
+ pub fn get_topo_and_suf_len_sorted_node_ids(&self) -> &Vec { + &self.topo_and_suf_len_sorted_order } pub fn construct_from_trie(node: TN) -> Self @@ -127,7 +130,7 @@ impl GeneralSAM { let accept_empty_string = node.is_accepting(); sam.build_with_trie(node); - sam.topo_sort(); + sam.topo_sort_with_queue(); sam.update_accepting(); sam.node_pool[SAM_ROOT_NODE_ID].accept = accept_empty_string; @@ -159,9 +162,9 @@ impl GeneralSAM { .unwrap(); } - fn topo_sort(&mut self) { - let mut in_degree: Vec = Vec::new(); - in_degree.resize(self.node_pool.len(), 0); + fn topo_sort_with_queue(&mut self) { + let mut in_degree: Vec = vec![0; self.num_of_nodes()]; + self.node_pool.iter().for_each(|node| { node.trans.values().for_each(|v| { in_degree[*v] += 1; @@ -169,27 +172,31 @@ impl GeneralSAM { }); assert!(in_degree[SAM_ROOT_NODE_ID] == 0); - self.topo_order.reserve(self.node_pool.len()); + self.topo_and_suf_len_sorted_order + .reserve(self.node_pool.len()); - self.topo_order.push(SAM_ROOT_NODE_ID); + self.topo_and_suf_len_sorted_order.push(SAM_ROOT_NODE_ID); let mut head = 0; - while head < self.topo_order.len() { - let u_id = self.topo_order[head]; + while head < self.topo_and_suf_len_sorted_order.len() { + let u_id = self.topo_and_suf_len_sorted_order[head]; head += 1; self.node_pool[u_id].trans.values().for_each(|v_id| { in_degree[*v_id] -= 1; if in_degree[*v_id] == 0 { - self.topo_order.push(*v_id); + self.topo_and_suf_len_sorted_order.push(*v_id); } }); } } fn update_accepting(&mut self) { - self.topo_order.iter().rev().for_each(|node_id| { - let link_id = self.node_pool[*node_id].link; - self.node_pool[link_id].accept |= self.node_pool[*node_id].accept; - }); + self.topo_and_suf_len_sorted_order + .iter() + .rev() + .for_each(|node_id| { + let link_id = self.node_pool[*node_id].link; + self.node_pool[link_id].accept |= self.node_pool[*node_id].accept; + }); self.node_pool[SAM_NIL_NODE_ID].accept = false; } From 2267184b7958927fc949eaadd55b2cc1bf63f78a Mon Sep 17 00:00:00 
2001 From: Chielo Newctle Date: Wed, 18 Oct 2023 10:59:04 +0800 Subject: [PATCH 5/5] chore(test): add a test to verify `get_topo_and_suf_len_sorted_node_ids` --- Cargo.lock | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 +++ src/tests.rs | 58 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 128 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index ad8a5b4..8f841c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,74 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "general-sam" version = "0.2.0" +dependencies = [ + "rand", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "libc" +version = "0.2.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = 
"rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" diff --git a/Cargo.toml b/Cargo.toml index 96d5f6c..2bb03a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,6 @@ exclude = ["release-plz.toml", "cliff.tolm"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] name = "general_sam" + +[dev-dependencies] +rand = "0.8.5" diff --git a/src/tests.rs b/src/tests.rs index fbeab67..8aa703d 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,4 +1,10 @@ -use crate::{sam::GeneralSAM, trie::Trie}; +use rand::{ + distributions::{Alphanumeric, DistString}, + rngs::StdRng, + Rng, SeedableRng, +}; + +use crate::{sam::GeneralSAM, trie::Trie, SAM_ROOT_NODE_ID}; #[test] fn test_example_from_chars() { @@ -165,3 +171,53 @@ fn test_simple_trie_suffix() { let vocab = ["ac", "bb", "b", "cc", "aabb", "a", "ba", "c", "aa"]; test_trie_suffix(&vocab); } + +#[test] +fn test_topo_and_suf_len_sorted_order() { + let mut rng = StdRng::seed_from_u64(1134759173975); + for _ in 0..10000 { + let mut trie = Trie::default(); + for _ in 0..rng.gen_range(0..32) { + let len = rng.gen_range(0..9); + let string = Alphanumeric.sample_string(&mut rng, len); + trie.insert_ref_iter(string.as_bytes().iter()); + } + + let sam: GeneralSAM = GeneralSAM::construct_from_trie(trie.get_root_state()); + + let order = sam.get_topo_and_suf_len_sorted_node_ids(); + let rank = { + let mut rank = vec![0; sam.num_of_nodes()]; + order.iter().enumerate().for_each(|(k, i)| { + rank[*i] = k; + }); + rank + }; + + // verify that max suffix lengths should be sorted + for pos in 0..order.len() - 1 { + 
assert!( + sam.get_node(order[pos]).unwrap().max_suffix_len() + <= sam.get_node(order[pos + 1]).unwrap().max_suffix_len() + ); + } + + // verify topological ordering + order.iter().for_each(|node_id| { + let node = sam.get_node(*node_id).unwrap(); + + node.get_trans().values().for_each(|next_node_id| { + assert!(rank[*next_node_id] > rank[*node_id]); + }); + }); + + // verify suffix parent tree depth ordering + order.iter().for_each(|node_id| { + let node = sam.get_node(*node_id).unwrap(); + + if *node_id != SAM_ROOT_NODE_ID { + assert!(rank[node.get_suffix_parent_id()] < rank[*node_id]); + } + }); + } +}