Skip to content

Commit

Permalink
~4% faster now
Browse files Browse the repository at this point in the history
  • Loading branch information
drahnr committed Apr 7, 2021
1 parent c49c527 commit 2bb7366
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 38 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ members = [
opt-level = 2
build-override = { opt-level = 2 }

[patch.crates-io]
bimap = { git = "https://github.com/drahnr/bimap-rs.git", branch = "master" }

# [profile.release] # BUILD_BINDINGS_UNCOMMENT
# lto = "fat" # BUILD_BINDINGS_UNCOMMENT
# codegen-units = 1 # BUILD_BINDINGS_UNCOMMENT
2 changes: 2 additions & 0 deletions nlprule/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.3", features = ["serde"] }
lazycell = "1"
cfg-if = "1"
fnv = "1"
hashbrown = "0.11"

rayon-cond = "0.1"
rayon = "1.5"
Expand Down
9 changes: 5 additions & 4 deletions nlprule/src/compile/impls.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use bimap::BiMap;
use fnv::{FnvBuildHasher, FnvHashSet};
use fs_err::File;
use indexmap::IndexMap;
use log::warn;
Expand Down Expand Up @@ -133,12 +134,12 @@ impl Tagger {
let mut tag_store: Vec<_> = tag_store.iter().collect();
tag_store.sort();

let word_store: BiMap<_, _> = word_store
let word_store: FastBiMap<_, _> = word_store
.iter()
.enumerate()
.map(|(i, x)| (x.to_string(), WordIdInt(i as u32)))
.collect();
let tag_store: BiMap<_, _> = tag_store
let tag_store: FastBiMap<_, _> = tag_store
.iter()
.enumerate()
.map(|(i, x)| (x.to_string(), PosIdInt(i as u16)))
Expand All @@ -149,9 +150,9 @@ impl Tagger {
let inflection_id = word_store.get_by_left(inflection).unwrap();
let pos_id = tag_store.get_by_left(tag).unwrap();

let group = groups.entry(*inflection_id).or_insert_with(Vec::new);
let group = groups.entry(*inflection_id).or_insert_with(HashSet::<WordIdInt, FnvBuildHasher>::default);
if !group.contains(word_id) {
group.push(*word_id);
group.insert(*word_id);
}

tags.entry(*word_id)
Expand Down
58 changes: 25 additions & 33 deletions nlprule/src/tokenizer/tag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
//! where each word typically has multiple entries with different part-of-speech tags.
use crate::types::*;
use bimap::BiMap;
use fst::{IntoStreamer, Map, Streamer};
use indexmap::IndexMap;
use lazycell::AtomicLazyCell;
Expand Down Expand Up @@ -34,7 +33,7 @@ impl Default for TaggerLangOptions {
struct TaggerFields {
tag_fst: Vec<u8>,
word_store_fst: Vec<u8>,
tag_store: BiMap<String, PosIdInt>,
tag_store: FastBiMap<String, PosIdInt>,
lang_options: TaggerLangOptions,
}

Expand All @@ -51,20 +50,11 @@ impl From<Tagger> for TaggerFields {
assert!(i < 255);
i += 1;

let key: Vec<u8> = word.as_bytes().iter().chain(once(&i)).copied().collect();
let pos_bytes = pos_id.0.to_be_bytes();
let inflect_bytes = inflect_id.0.to_be_bytes();

let value = u64::from_be_bytes([
inflect_bytes[0],
inflect_bytes[1],
inflect_bytes[2],
inflect_bytes[3],
0,
0,
pos_bytes[0],
pos_bytes[1],
]);
let key: Vec<u8> = word.as_bytes().iter().copied().chain(once(i)).collect();
let pos_bytes = pos_id.0 as u64;
let inflect_bytes = inflect_id.0 as u64;
let value = (pos_bytes & 0xFFFF) | (inflect_bytes & 0xFFFF_FFFF) << 32;

tag_fst_items.push((key, value));
}
}
Expand Down Expand Up @@ -102,17 +92,21 @@ impl From<Tagger> for TaggerFields {
impl From<TaggerFields> for Tagger {
fn from(data: TaggerFields) -> Self {
let word_store_fst = Map::new(data.word_store_fst).unwrap();
let mut word_store = BiMap::<String, WordIdInt>::with_capacity(word_store_fst.len());
let mut word_store = FastBiMap::<String, WordIdInt>::with_capacity_and_hashers(
word_store_fst.len(),
Default::default(),
Default::default(),
);

let mut stream = word_store_fst.into_stream();
while let Some((key, value)) = stream.next() {
if let Some(key) = std::str::from_utf8(&key[..(key.len().saturating_sub(1))]).ok() {
if let Some(key) = std::str::from_utf8(key).ok() {
word_store.insert(key.to_owned(), WordIdInt(value as u32));
}
};

let mut tags = DefaultHashMap::new();
let mut groups = DefaultHashMap::new();
let mut tags = FastHashMap::new();
let mut groups = FastHashMap::new();

let tag_fst = Map::new(data.tag_fst).unwrap();
let mut stream = tag_fst.into_stream();
Expand All @@ -121,18 +115,16 @@ impl From<TaggerFields> for Tagger {
let word = std::str::from_utf8(&key[..(key.len().saturating_sub(1))]).unwrap();
let word_id = *word_store.get_by_left(word).unwrap();

let inflection_id = WordIdInt((value & 0xFFFF_FFFF as u64) as u32);
let pos_id = PosIdInt((value >> 48) as u16);
let inflection_id = WordIdInt((value >> 32) as u32);
let pos_id = PosIdInt((value & 0xFF_u64) as u16);

let group = groups.entry(inflection_id).or_insert_with(Vec::new);
if !group.contains(&word_id) {
group.push(word_id);
}
let group = groups.entry(inflection_id).or_insert_with(FastHashSet::default);
let _ = group.insert(word_id);

tags.entry(word_id)
.or_insert_with(IndexMap::new)
.entry(inflection_id)
.or_insert_with(Vec::new)
.or_insert_with(|| Vec::with_capacity(32))
.push(pos_id);
}

Expand All @@ -151,10 +143,10 @@ impl From<TaggerFields> for Tagger {
#[derive(Default, Serialize, Deserialize, Clone)]
#[serde(from = "TaggerFields", into = "TaggerFields")]
pub struct Tagger {
pub(crate) tags: DefaultHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
pub(crate) tag_store: BiMap<String, PosIdInt>,
pub(crate) word_store: BiMap<String, WordIdInt>,
pub(crate) groups: DefaultHashMap<WordIdInt, Vec<WordIdInt>>,
pub(crate) tags: FastHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
pub(crate) tag_store: FastBiMap<String, PosIdInt>,
pub(crate) word_store: FastBiMap<String, WordIdInt>,
pub(crate) groups: FastHashMap<WordIdInt, FastHashSet<WordIdInt>>,
pub(crate) lang_options: TaggerLangOptions,
pub(crate) sent_start: AtomicLazyCell<Token<'static>>,
}
Expand Down Expand Up @@ -212,12 +204,12 @@ impl Tagger {
}

#[allow(dead_code)] // used by compile module
pub(crate) fn tag_store(&self) -> &BiMap<String, PosIdInt> {
pub(crate) fn tag_store(&self) -> &FastBiMap<String, PosIdInt> {
&self.tag_store
}

#[allow(dead_code)] // used by compile module
pub(crate) fn word_store(&self) -> &BiMap<String, WordIdInt> {
pub(crate) fn word_store(&self) -> &FastBiMap<String, WordIdInt> {
&self.word_store
}

Expand Down
6 changes: 6 additions & 0 deletions nlprule/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,17 @@ use std::{
fmt,
ops::{Add, AddAssign, Range, Sub},
};
use bimap::BiHashMap;


/// Crate-wide alias for `HashMap<K, V>`.
/// NOTE(review): `HashMap` is imported outside this view; presumably `std::collections::HashMap` — confirm.
pub(crate) type DefaultHashMap<K, V> = HashMap<K, V>;
/// Crate-wide alias for `HashSet<T>`; the set counterpart of [`DefaultHashMap`].
/// NOTE(review): `HashSet` is imported outside this view; presumably `std::collections::HashSet` — confirm.
pub(crate) type DefaultHashSet<T> = HashSet<T>;
/// Crate-wide alias for `hash_map::DefaultHasher`.
pub(crate) type DefaultHasher = hash_map::DefaultHasher;

/// Bidirectional map (`bimap::BiHashMap`) in which both the left and the right
/// side are hashed with `hashbrown::hash_map::DefaultHashBuilder`.
pub(crate) type FastBiMap<L,R> = BiHashMap<L, R, hashbrown::hash_map::DefaultHashBuilder, hashbrown::hash_map::DefaultHashBuilder>;
/// Alias for `hashbrown::HashSet<I>`, leaving the hasher parameter at hashbrown's default.
pub(crate) type FastHashSet<I> = hashbrown::HashSet<I>;
/// Alias for `hashbrown::HashMap<K, V>`, leaving the hasher parameter at hashbrown's default.
pub(crate) type FastHashMap<K,V> = hashbrown::HashMap<K,V>;

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct WordIdInt(pub u32);
Expand Down
4 changes: 3 additions & 1 deletion nlprule/src/utils/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,9 @@ cfg_if::cfg_if! {
use regex_impl_all as regex_impl;
} else if #[cfg(feature = "regex-onig")] {
use regex_impl_onig as regex_impl;
} else {
} else if #[cfg(feature = "regex-fancy")] {
use regex_impl_fancy as regex_impl;
} else {
compile_error!{"Must select exactly one regex impl via features: regex-onig OR regex-fancy"}
}
}

0 comments on commit 2bb7366

Please sign in to comment.