Enable MacOS and Windows build #8

Closed. Wants to merge 8 commits.

Changes from all commits

53 changes: 53 additions & 0 deletions .github/workflows/release.yml
@@ -45,6 +45,59 @@ jobs:
name: wheels-linux-${{ matrix.platform.target }}
path: dist

windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist

macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-12
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist

sdist:
runs-on: ubuntu-22.04
steps:
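
Both new jobs mirror the existing Linux job's shape: a runner/target matrix feeds PyO3/maturin-action, `--find-interpreter` has maturin build a wheel for each Python interpreter it finds on the runner, and sccache is enabled to cache compiler output across builds.
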
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,5 +1,5 @@
# Ignore binary model files
# but keep heliport.data dir so maturin picks it up when doing a clean build
# Training files
*.train

# Wheels
wheels*
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## v0.8.0
### Added
- Model creation command.

### Changed
- Include binarized model in the wheel.
10 changes: 7 additions & 3 deletions Cargo.toml
@@ -14,6 +14,9 @@ crate-type = ["lib", "cdylib"]
[workspace]
members = ["heliport-model"]

[profile.release]
lto = "thin"

[build-dependencies]
heliport-model = { path = "heliport-model" }
anyhow = "1.0"
@@ -29,7 +32,7 @@ ordered-float = "4.2"
log = { version = "0.4" }
env_logger = "0.10"
strum = { version = "0.25", features = ["derive"] }
pyo3 = { version = "0.22", features = ["gil-refs", "anyhow"], optional = true }
pyo3 = { version = "0.23", features = ["anyhow"], optional = true }
target = { version = "2.1.0", optional = true }
tempfile = { version = "3", optional = true }
reqwest = { version = "0.12", features = ["stream", "rustls-tls"], optional = true }
@@ -40,13 +43,14 @@ anyhow = "1.0"
rayon = "1.10"
itertools = "0.11"
lazy_static = "1.5"
counter = "0.6.0"

[dev-dependencies]
test-log = "0.2.15"

[features]
# Put log features in default so that crates using heliport as a library can disable them
default = ["cli", "log/max_level_debug", "log/release_max_level_debug"]
cli = ["python", "dep:clap", "dep:target"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util"]
cli = ["python", "dep:clap"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util", "dep:target"]
python = ["dep:pyo3"]
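
The `[features]` change moves `dep:target` out of `cli` and into `download`, presumably because only the download code needs platform detection. A minimal sketch of how such a feature gate looks on the code side; the function below is illustrative, not code from this PR:

```
// Compiled only with `--features download`; without the feature, neither
// this code nor the `target` dependency is built into the binary.
#[cfg(feature = "download")]
pub fn platform_tag() -> String {
    // std-based stand-in; the crate itself depends on the `target` crate here
    format!("{}-{}", std::env::consts::OS, std::env::consts::ARCH)
}
```
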
33 changes: 23 additions & 10 deletions README.md
@@ -14,6 +14,8 @@ Install it in your environment
pip install heliport
```

NOTE: Since version 0.8, models no longer need to be downloaded.

### From source
Install the requirements:
- Python
@@ -50,16 +52,27 @@ Arguments:
[OUTPUT_FILE] Output file, default: stdout

Options:
-j, --threads <THREADS> Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separated thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE> Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence Ignore confidence thresholds. Predictions under the thresholds will not be labeled as 'und'
-s, --print-scores Print confidence score (higher is better) or raw score (higher is better) in case '-c' is provided
-m, --model-dir <MODEL_DIR> Model directory containing binarized model or plain text model. Default is Python module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS> Load only relevant languages. Specify a comma-separated list of language codes. Needs plain text model directory
-h, --help Print help
-j, --threads <THREADS>
Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separate thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE>
Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence
Ignore confidence thresholds. Predictions under the thresholds will not be
labeled as 'und'
-s, --print-scores
Print confidence score (higher is better) or raw score (higher is better) in case
'-c' is provided
-m, --model-dir <MODEL_DIR>
Model directory containing binarized model or plain text model. Default is Python
module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS>
Load only relevant languages. Specify a comma-separated list of language codes.
Needs plain text model directory
-h, --help
Print help
```

### Python package
1 change: 1 addition & 0 deletions heliport-model/Cargo.toml
@@ -10,3 +10,4 @@ strum = { version = "0.25", features = ["derive"] }
strum_macros = "0.25"
wyhash2 = "0.2.1"
anyhow = "1.0"
rayon = "1.10"
32 changes: 22 additions & 10 deletions heliport-model/src/languagemodel.rs
@@ -10,6 +10,7 @@ use std::thread;
use anyhow::{Context, Result, bail};
use bitcode;
use log::{info, debug, warn};
use rayon::prelude::*;
use strum::{Display, EnumCount, IntoEnumIterator};
use strum_macros::EnumIter;

@@ -87,6 +88,7 @@ impl ModelNgram {
dic: HashMap::default(),
model_type: model_type.clone(),
};
let model_repr = model_type.to_string();

// Open languagelist for this model
let lang_list = fs::read_to_string(model_dir.join("languagelist"))
@@ -99,7 +101,7 @@ impl ModelNgram {
let lang_repr = lang.to_string().to_lowercase();
// Models may not have all the language codes supported by the library
if !lang_list.contains(&lang_repr[..]) {
warn!("Language '{lang_repr}' not found in languagelist, omitting");
warn!("{model_repr}: Language '{lang_repr}' not found in languagelist, omitting");
continue;
}

@@ -291,16 +293,26 @@ impl Index<usize> for Model {

/// Binarize models and save in a path
pub fn binarize(save_path: &Path, model_path: &Path) -> Result<()> {
for model_type in OrderNgram::iter() {
let type_repr = model_type.to_string();
info!("Loading {type_repr} model");
let model = ModelNgram::from_text(&model_path, model_type, None)?;
let size = model.dic.len();
info!("Created {size} entries");
let filename = save_path.join(format!("{type_repr}.bin"));
info!("Saving {type_repr} model");
model.save(Path::new(&filename))?;
let orders: Vec<_> = OrderNgram::iter().collect();

let results: Vec<Result<_>> = orders
.par_iter()
.panic_fuse()
.map(|model_type| -> Result<()> {
let type_repr = model_type.to_string();
info!("{type_repr}: loading text model");
let model = ModelNgram::from_text(&model_path, model_type.clone(), None)?;
let size = model.dic.len();
let filename = save_path.join(format!("{type_repr}.bin"));
info!("{type_repr}: saving binarized model with {size} entries");
model.save(Path::new(&filename))
}).collect();

// If any model failed, propagate the first error
for r in results {
let _ = r?;
}

info!("Copying confidence thresholds file");
fs::copy(
model_path.join(Model::CONFIDENCE_FILE),
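
The refactor above fans the per-order binarization out with rayon, then walks the collected `Vec<Result<_>>` to surface the first failure. A minimal sketch of the same pattern in compact form, assuming the same anyhow and rayon crates; the closure body is elided:

```
use anyhow::Result;
use rayon::prelude::*;

// Collecting a parallel iterator of Result<()> directly into Result<()>
// short-circuits on the first Err, matching the collect-then-loop above.
fn binarize_all(orders: Vec<u8>) -> Result<()> {
    orders
        .par_iter()
        .panic_fuse() // stop the pool early if any job panics
        .map(|_order| -> Result<()> {
            Ok(()) // stand-in for loading and saving one model
        })
        .collect::<Result<()>>()
}
```
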
51 changes: 51 additions & 0 deletions src/cli/create_models.rs
@@ -0,0 +1,51 @@
use std::path::{PathBuf};
use std::process::exit;
use std::time::Instant;

use anyhow::Context;
use clap::Args;
use log::{info, error};
use pyo3::prelude::*;
use rayon::prelude::*;

use crate::utils::Abort;
use crate::trainer::count_all_ngrams;

#[derive(Args, Clone)]
pub struct CreateModelCmd {
#[arg(help="Output directory to save the ngram frequency files")]
output_dir: PathBuf,
#[arg(help="Directory where input text files are located")]
input_files: Vec<PathBuf>,
#[arg(short = 'k', long, default_value_t = 10000, help="Truncate at top-k most frequent n-grams")]
topk: usize,
}

impl CreateModelCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

if !self.output_dir.exists() {
error!("Output directory '{}' does not exist, please create it", self.output_dir.display());
exit(1);
}

info!("Saving top {} most frequent n-grams", self.topk);

// Train each file/language in parallel;
// use panic_fuse to fail early if one of the jobs fails
self.input_files
.into_par_iter()
.panic_fuse()
.for_each(|lang_file| {
count_all_ngrams(&lang_file, &self.output_dir, self.topk)
.with_context(|| format!("Error with file '{}'", lang_file.display()))
.or_abort(1);
});

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}
}
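
Design note: unlike `binarize`, which collects every job's `Result` and propagates the first failure, this command calls `or_abort(1)` inside the parallel `for_each`, so the whole process exits from whichever worker hits an error first, and `panic_fuse` gives panics the same early-exit behavior. With clap's default kebab-case naming, the subcommand would be invoked as something like `heliport create-model out_dir/ eng.train fra.train -k 10000` (paths illustrative).
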
14 changes: 13 additions & 1 deletion src/cli/identify.rs
@@ -2,11 +2,12 @@ use std::io::{self, BufRead, BufReader, Write, BufWriter};
use std::fs::File;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Instant;

use anyhow::{Context, Result};
use clap::Args;
use itertools::Itertools;
use log::{debug};
use log::{info, debug};
use pyo3::prelude::*;

use heliport_model::Lang;
@@ -71,10 +72,14 @@ fn parse_langs(langs_text: &Vec<String>) -> Result<Vec<Lang>> {

impl IdentifyCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

// If provided, parse the list of relevant languages
let mut relevant_langs = None;
if let Some(r) = &self.relevant_langs {
relevant_langs = Some(parse_langs(&r).or_abort(1));
info!("Using relevant langs: {:?}", relevant_langs.as_ref().unwrap());
}
debug!("{:?}", self);

@@ -106,19 +111,26 @@ impl IdentifyCmd {
output_file = Box::new(io::stdout().lock());
}

info!("Loading model");
// Load identifier
let mut identifier = Identifier::load(&model_dir, relevant_langs)
.or_abort(1);
if self.ignore_confidence {
info!("Disabled confidence thresholds");
identifier.disable_confidence();
}

// do not run on separate threads if multithreading is not requested
if self.threads == 0 {
info!("Running single-threaded");
self.run_single(identifier, input_file, output_file).or_abort(1);
} else {
info!("Running with {} threads", self.threads);
self.run_parallel(identifier, input_file, output_file).or_abort(1);
}

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}

13 changes: 12 additions & 1 deletion src/cli/mod.rs
@@ -2,6 +2,7 @@ mod identify;
#[cfg(feature = "download")]
mod download;
mod binarize;
mod create_models;

use clap::{Subcommand, Parser};
use log::{debug};
@@ -13,12 +14,15 @@ use crate::python::module_path;
use self::download::DownloadCmd;
use self::binarize::BinarizeCmd;
use self::identify::IdentifyCmd;
use self::create_models::CreateModelCmd;

#[derive(Parser, Clone)]
#[command(version, about, long_about = None)]
pub struct Cli {
#[command(subcommand)]
command: Commands,
#[arg(short, long, help="Do not print log messages")]
quiet: bool,
}

#[derive(Subcommand, Clone)]
@@ -31,6 +35,8 @@ enum Commands {
Binarize(BinarizeCmd),
#[command(about="Identify languages of input text", visible_alias="detect")]
Identify(IdentifyCmd),
#[command(about="Create heliport models")]
CreateModel(CreateModelCmd),
}


@@ -41,12 +47,17 @@ pub fn cli_run() -> PyResult<()> {
let os_args = std::env::args_os().skip(1);
let args = Cli::parse_from(os_args);
debug!("Module path found at: {}", module_path().expect("Could not found module path").display());
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
if !args.quiet {
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
} else {
env_logger::Builder::from_env(Env::default().default_filter_or("error")).init();
}

match args.command {
#[cfg(feature = "download")]
Commands::Download(cmd) => { cmd.cli() },
Commands::Binarize(cmd) => { cmd.cli() },
Commands::Identify(cmd) => { cmd.cli() },
Commands::CreateModel(cmd) => { cmd.cli() },
}
}
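
Since the two branches differ only in the fallback filter string, the same behavior fits in one builder call; note that `Env::default()` still reads `RUST_LOG`, so an explicit environment filter overrides `--quiet` in both versions. A sketch, assuming the same env_logger API:

```
use env_logger::Env;

fn init_logging(quiet: bool) {
    // pick the fallback level once instead of duplicating the builder call;
    // an explicit RUST_LOG still overrides this default either way
    let default_level = if quiet { "error" } else { "info" };
    env_logger::Builder::from_env(Env::default().default_filter_or(default_level)).init();
}
```
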
8 changes: 1 addition & 7 deletions src/identifier.rs
@@ -5,23 +5,17 @@ use std::sync::{Arc, Mutex};
use ordered_float::OrderedFloat;
use strum::{IntoEnumIterator, EnumCount};
use shingles::AsShingles;
use regex::Regex;
use anyhow::Result;
use log::{debug,warn};
use lazy_static::lazy_static;
use rayon::prelude::*;

#[cfg(feature = "python")]
use pyo3::pyclass;

use heliport_model::Model;
use heliport_model::{Lang, LangScores, LangBitmap};
use crate::utils::is_cjk_block;
use crate::utils::{is_cjk_block, RE_NON_ALPHA};

lazy_static! {
static ref RE_NON_ALPHA: Regex = Regex::new(r#"[^#gc\p{L}\p{M}′'’´ʹािीुूृेैोौंँः् া ি ী ু ূ ৃ ে ৈ ো ৌ।্্্я̄\u07A6\u07A7\u07A8\u07A9\u07AA\u07AB\u07AC\u07AD\u07AE\u07AF\u07B0\u0A81\u0A82\u0A83\u0ABC\u0ABD\u0ABE\u0ABF\u0AC0\u0AC1\u0AC2\u0AC3\u0AC4\u0AC5\u0AC6\u0AC7\u0AC8\u0AC9\u0ACA\u0ACB\u0ACC\u0ACD\u0AD0\u0AE0\u0AE1\u0AE2\u0AE3\u0AE4\u0AE5\u0AE6\u0AE7\u0AE8\u0AE9\u0AEA\u0AEB\u0AEC\u0AED\u0AEE\u0AEF\u0AF0\u0AF1]"#)
.expect("Error compiling non-alpha regex for Idenfifier");
}

#[cfg_attr(feature = "python", pyclass)]
pub struct Identifier {
1 change: 1 addition & 0 deletions src/lib.rs
@@ -6,3 +6,4 @@ pub mod utils;
mod cli;
#[cfg(feature = "python")]
mod python;
pub mod trainer;
6 changes: 3 additions & 3 deletions src/python.rs
@@ -12,12 +12,12 @@ pub fn module_path() -> PyResult<PathBuf> {
let mut path = PathBuf::new();
Python::with_gil(|py| {
// Instead of hardcoding the module name, obtain it from the crate name at compile time
let module = PyModule::import_bound(py, env!("CARGO_PKG_NAME"))?;
let paths: Vec<&str> = module
let module = PyModule::import(py, env!("CARGO_PKG_NAME"))?;
let paths: Vec<String> = module
.getattr("__path__")?
.extract()?;
// __path__ attribute returns a list of paths, return first
path.push(paths[0]);
path.push(&paths[0]);
Ok(path)
})
}
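
This hunk tracks the PyO3 0.23 upgrade in Cargo.toml: 0.23 drops the deprecated gil-refs API, so the `_bound` method names revert to their plain forms (`import_bound` becomes `import`), and extracting owned `String`s avoids holding `&str` borrows into the temporary `__path__` object.
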
114 changes: 114 additions & 0 deletions src/trainer.rs
@@ -0,0 +1,114 @@
use std::fs::File;
use std::io::{BufRead, BufReader, Write, BufWriter};
use std::path::Path;

use anyhow::{Context, Result};
use counter::Counter;
use lazy_static::lazy_static;
use log::{info, debug};
use rayon::prelude::*;
use regex::Regex;
use shingles::AsShingles;
use strum::IntoEnumIterator;

use crate::utils::RE_NON_ALPHA;

use heliport_model::{OrderNgram};


lazy_static! {
static ref RE_LANG_NAME: Regex = Regex::new(r"(\w{3,7})\.train$")
.expect("Error compiling lang name from file regex");
}

// Count n-gram frequency of a given n-gram order in the text contained in the file
fn count_ngrams(input_file_path: &Path, order: OrderNgram) -> Result<Counter<String>> {
let input_file = BufReader::new(File::open(input_file_path)?);
let mut counts = Counter::new();

// Read training file line by line and accumulate ngram counts
for line_res in input_file.lines() {
let line = line_res?;
// Replace punctuation by spaces
let replaced = RE_NON_ALPHA.replace_all(&line, " ");

// iterate over words
for word in replaced.split_whitespace() {
// if the current order is Word, just count whole words;
// otherwise wrap the word in space boundaries and count
// every ngram of the current order inside it
if order == OrderNgram::Word {
if let Some(entry) = counts.get_mut(word) {
*entry += 1;
} else {
counts.insert(String::from(word), 1);
}
} else {
let wordspace = format!(" {word} ");
// order can be cast to integer because the internal representations
// have the same number (word is 0, unigram is 1 and so on)
for gram in wordspace.as_shingles(order as usize) {
if let Some(entry) = counts.get_mut(gram) {
*entry += 1;
} else {
counts.insert(String::from(gram), 1);
}
}
}
}
}

Ok(counts)
}

// Count n-gram frequency of all n-gram orders for a given language
pub fn count_all_ngrams(input_file_path: &Path, output_dir: &Path, top_k: usize) -> Result<()> {
// use the lang prefix in the input file as language code
let string_file_name = input_file_path.to_string_lossy();
let lang_string = RE_LANG_NAME
.captures(&string_file_name)
.context("Could not parse language name from input_file")?
.get(1)
.with_context(|| "Could not get first capture group from lang name regex")?
.as_str();
// Check that the language exists
// avoided for now, as it would require recompiling with the new lang before training
// let lang = Lang::from_str(&lang_string)
// .with_context(|| format!("Could not parse lang '{lang_string}'"))?;
info!("Training '{lang_string}'");

// Run training for each ngram order in parallel
let ngram_orders: Vec<_> = OrderNgram::iter().collect();
let results: Vec<Result<_>> = ngram_orders
.into_par_iter()
.map(|order| -> Result<()> {
// Obtain ngram frequencies
let counts = count_ngrams(input_file_path, order)?;
// create output file with the language code and ngram order as name
let output_file =
File::create(output_dir.join(format!("{}.{}.model", lang_string, order)))
.with_context(|| "Could not create file")?;
let mut output_file = BufWriter::new(output_file);
let total = counts.total::<usize>();
debug!(
"Total: {} top-10: {:?}",
total,
counts.k_most_common_ordered(10)
);

// Write the top-k most frequent n-grams with their frequencies and the total count
writeln!(&mut output_file, "{}", total)?;
for (ngram, count) in counts.k_most_common_ordered(top_k) {
writeln!(&mut output_file, "{ngram}\t{count}")?;
}
Ok(())
}).collect();

for r in results {
let _ = r?;
}

info!("Finished '{lang_string}'");
Ok(())
}
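
The files written above have a simple shape: one line with the total n-gram count, then one `ngram<TAB>count` line per entry. A minimal reader sketch for that format; the layout comes from the writer above, while the names and error messages are assumptions:

```
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use anyhow::{Context, Result};

// Parse one "<lang>.<order>.model" file back into (total, entries).
fn read_model(path: &Path) -> Result<(usize, Vec<(String, usize)>)> {
    let mut lines = BufReader::new(File::open(path)?).lines();
    // the first line holds the total n-gram count
    let total: usize = lines.next().context("empty model file")??.trim().parse()?;
    let mut entries = Vec::new();
    for line in lines {
        let line = line?;
        let (ngram, count) = line.split_once('\t').context("line without tab")?;
        entries.push((ngram.to_string(), count.parse()?));
    }
    Ok((total, entries))
}
```
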
10 changes: 9 additions & 1 deletion src/utils.rs
@@ -1,8 +1,15 @@
use std::process::exit;

use lazy_static::lazy_static;
use log::error;
use regex::Regex;
use unicode_blocks;

lazy_static! {
pub static ref RE_NON_ALPHA: Regex = Regex::new(r#"[^#gc\p{L}\p{M}′'’´ʹािीुूृेैोौंँः् া ি ী ু ূ ৃ ে ৈ ো ৌ।্্্я̄\u07A6\u07A7\u07A8\u07A9\u07AA\u07AB\u07AC\u07AD\u07AE\u07AF\u07B0\u0A81\u0A82\u0A83\u0ABC\u0ABD\u0ABE\u0ABF\u0AC0\u0AC1\u0AC2\u0AC3\u0AC4\u0AC5\u0AC6\u0AC7\u0AC8\u0AC9\u0ACA\u0ACB\u0ACC\u0ACD\u0AD0\u0AE0\u0AE1\u0AE2\u0AE3\u0AE4\u0AE5\u0AE6\u0AE7\u0AE8\u0AE9\u0AEA\u0AEB\u0AEC\u0AED\u0AEE\u0AEF\u0AF0\u0AF1]"#)
.expect("Error compiling non-alpha regex for Idenfifier");
}

// Trait that extracts the contained Ok value, or aborts on error
// after sending the error message to the log
pub trait Abort<T> {
@@ -14,7 +21,8 @@ impl<T, E: std::fmt::Display> Abort<T> for Result<T, E>
fn or_abort(self, exit_code: i32) -> T {
match self {
Ok(v) => v,
Err(e) => { error!("{e}"); exit(exit_code); },
// Print the whole error context with :#
Err(e) => { error!("{e:#}"); exit(exit_code); },
}
}
}
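
The `{e:#}` change is what surfaces the `with_context` annotations added elsewhere in this PR. A small illustration of the difference, assuming anyhow's error type:

```
use anyhow::{Context, Result};

fn load() -> Result<Vec<u8>> {
    std::fs::read("missing.train").context("Error with file 'missing.train'")
}

fn main() {
    if let Err(e) = load() {
        println!("{e}");   // prints only the outermost context message
        println!("{e:#}"); // appends each cause, e.g. ": No such file or directory (os error 2)"
    }
}
```
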