Enable MacOS and Windows build #8

Closed. Wants to merge 8 commits.

Changes from all commits

53 changes: 53 additions & 0 deletions .github/workflows/release.yml
@@ -45,6 +45,59 @@ jobs:
name: wheels-linux-${{ matrix.platform.target }}
path: dist

windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist

macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-12
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist

sdist:
runs-on: ubuntu-22.04
steps:
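
Both new jobs mirror the existing Linux job's shape: a runner/target matrix feeds PyO3/maturin-action, `--find-interpreter` has maturin build a wheel for each Python interpreter it finds on the runner, and sccache is enabled to cache compiler output across builds.
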
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,5 +1,5 @@
# Ignore binary model files
# but keep heliport.data dir so maturin picks it up when doing a clean build
# Training files
*.train

# Wheels
wheels*
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## v0.8.0
### Added
- Model creation command.

### Changed
- Include binarized model in the wheel.
10 changes: 7 additions & 3 deletions Cargo.toml
@@ -14,6 +14,9 @@ crate-type = ["lib", "cdylib"]
[workspace]
members = ["heliport-model"]

[profile.release]
lto = "thin"

[build-dependencies]
heliport-model = { path = "heliport-model" }
anyhow = "1.0"
@@ -29,7 +32,7 @@ ordered-float = "4.2"
log = { version = "0.4" }
env_logger = "0.10"
strum = { version = "0.25", features = ["derive"] }
pyo3 = { version = "0.22", features = ["gil-refs", "anyhow"], optional = true }
pyo3 = { version = "0.23", features = ["anyhow"], optional = true }
target = { version = "2.1.0", optional = true }
tempfile = { version = "3", optional = true }
reqwest = { version = "0.12", features = ["stream", "rustls-tls"], optional = true }
@@ -40,13 +43,14 @@ anyhow = "1.0"
rayon = "1.10"
itertools = "0.11"
lazy_static = "1.5"
counter = "0.6.0"

[dev-dependencies]
test-log = "0.2.15"

[features]
# Put log features in default so that crates using heliport as a library can disable them
default = ["cli", "log/max_level_debug", "log/release_max_level_debug"]
cli = ["python", "dep:clap", "dep:target"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util"]
cli = ["python", "dep:clap"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util", "dep:target"]
python = ["dep:pyo3"]
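
The `[features]` change moves `dep:target` out of `cli` and into `download`, presumably because only the download code needs platform detection. A minimal sketch of how such a feature gate looks on the code side; the function below is illustrative, not code from this PR:

```
// Compiled only with `--features download`; without the feature, neither
// this code nor the `target` dependency is built into the binary.
#[cfg(feature = "download")]
pub fn platform_tag() -> String {
    // std-based stand-in; the crate itself depends on the `target` crate here
    format!("{}-{}", std::env::consts::OS, std::env::consts::ARCH)
}
```
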
33 changes: 23 additions & 10 deletions README.md
@@ -14,6 +14,8 @@ Install it in your environment
pip install heliport
```

NOTE: Since version 0.8, models no longer need to be downloaded.

### From source
Install the requirements:
- Python
@@ -50,16 +52,27 @@ Arguments:
[OUTPUT_FILE] Output file, default: stdout

Options:
-j, --threads <THREADS> Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separated thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE> Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence Ignore confidence thresholds. Predictions under the thresholds will not be labeled as 'und'
-s, --print-scores Print confidence score (higher is better) or raw score (higher is better) in case '-c' is provided
-m, --model-dir <MODEL_DIR> Model directory containing binarized model or plain text model. Default is Python module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS> Load only relevant languages. Specify a comma-separated list of language codes. Needs plain text model directory
-h, --help Print help
-j, --threads <THREADS>
Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separate thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE>
Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence
Ignore confidence thresholds. Predictions under the thresholds will not be
labeled as 'und'
-s, --print-scores
Print confidence score (higher is better) or raw score (higher is better) in case
'-c' is provided
-m, --model-dir <MODEL_DIR>
Model directory containing binarized model or plain text model. Default is Python
module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS>
Load only relevant languages. Specify a comma-separated list of language codes.
Needs plain text model directory
-h, --help
Print help
```

### Python package
1 change: 1 addition & 0 deletions heliport-model/Cargo.toml
@@ -10,3 +10,4 @@ strum = { version = "0.25", features = ["derive"] }
strum_macros = "0.25"
wyhash2 = "0.2.1"
anyhow = "1.0"
rayon = "1.10"
32 changes: 22 additions & 10 deletions heliport-model/src/languagemodel.rs
@@ -10,6 +10,7 @@ use std::thread;
use anyhow::{Context, Result, bail};
use bitcode;
use log::{info, debug, warn};
use rayon::prelude::*;
use strum::{Display, EnumCount, IntoEnumIterator};
use strum_macros::EnumIter;

@@ -87,6 +88,7 @@ impl ModelNgram {
dic: HashMap::default(),
model_type: model_type.clone(),
};
let model_repr = model_type.to_string();

// Open languagelist for this model
let lang_list = fs::read_to_string(model_dir.join("languagelist"))
@@ -99,7 +101,7 @@ impl ModelNgram {
let lang_repr = lang.to_string().to_lowercase();
// Models may not have all the language codes supported by the library
if !lang_list.contains(&lang_repr[..]) {
warn!("Language '{lang_repr}' not found in languagelist, omitting");
warn!("{model_repr}: Language '{lang_repr}' not found in languagelist, omitting");
continue;
}

@@ -291,16 +293,26 @@ impl Index<usize> for Model {

/// Binarize models and save in a path
pub fn binarize(save_path: &Path, model_path: &Path) -> Result<()> {
for model_type in OrderNgram::iter() {
let type_repr = model_type.to_string();
info!("Loading {type_repr} model");
let model = ModelNgram::from_text(&model_path, model_type, None)?;
let size = model.dic.len();
info!("Created {size} entries");
let filename = save_path.join(format!("{type_repr}.bin"));
info!("Saving {type_repr} model");
model.save(Path::new(&filename))?;
let orders: Vec<_> = OrderNgram::iter().collect();

let results: Vec<Result<_>> = orders
.par_iter()
.panic_fuse()
.map(|model_type| -> Result<()> {
let type_repr = model_type.to_string();
info!("{type_repr}: loading text model");
let model = ModelNgram::from_text(&model_path, model_type.clone(), None)?;
let size = model.dic.len();
let filename = save_path.join(format!("{type_repr}.bin"));
info!("{type_repr}: saving binarized model with {size} entries");
model.save(Path::new(&filename))
}).collect();

// If any model failed, propagate the first error
for r in results {
let _ = r?;
}

info!("Copying confidence thresholds file");
fs::copy(
model_path.join(Model::CONFIDENCE_FILE),
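
The refactor above fans the per-order binarization out with rayon, then walks the collected `Vec<Result<_>>` to surface the first failure. A minimal sketch of the same pattern in compact form, assuming the same anyhow and rayon crates; the closure body is elided:

```
use anyhow::Result;
use rayon::prelude::*;

// Collecting a parallel iterator of Result<()> directly into Result<()>
// short-circuits on the first Err, matching the collect-then-loop above.
fn binarize_all(orders: Vec<u8>) -> Result<()> {
    orders
        .par_iter()
        .panic_fuse() // stop the pool early if any job panics
        .map(|_order| -> Result<()> {
            Ok(()) // stand-in for loading and saving one model
        })
        .collect::<Result<()>>()
}
```
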
51 changes: 51 additions & 0 deletions src/cli/create_models.rs
@@ -0,0 +1,51 @@
use std::path::{PathBuf};
use std::process::exit;
use std::time::Instant;

use anyhow::Context;
use clap::Args;
use log::{info, error};
use pyo3::prelude::*;
use rayon::prelude::*;

use crate::utils::Abort;
use crate::trainer::count_all_ngrams;

#[derive(Args, Clone)]
pub struct CreateModelCmd {
#[arg(help="Output directory to save the ngram frequency files")]
output_dir: PathBuf,
#[arg(help="Directory where input text files are located")]
input_files: Vec<PathBuf>,
#[arg(short = 'k', long, default_value_t = 10000, help="Truncate at top-k most frequent n-grams")]
topk: usize,
}

impl CreateModelCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

if !self.output_dir.exists() {
error!("Output directory '{}' does not exist, please create it", self.output_dir.display());
exit(1);
}

info!("Saving top {} most frequent n-grams", self.topk);

// Train each file/language in parallel;
// use panic_fuse to fail early if one of the jobs fails
self.input_files
.into_par_iter()
.panic_fuse()
.for_each(|lang_file| {
count_all_ngrams(&lang_file, &self.output_dir, self.topk)
.with_context(|| format!("Error with file '{}'", lang_file.display()))
.or_abort(1);
});

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}
}
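
Design note: unlike `binarize`, which collects every job's `Result` and propagates the first failure, this command calls `or_abort(1)` inside the parallel `for_each`, so the whole process exits from whichever worker hits an error first, and `panic_fuse` gives panics the same early-exit behavior. With clap's default kebab-case naming, the subcommand would be invoked as something like `heliport create-model out_dir/ eng.train fra.train -k 10000` (paths illustrative).
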
14 changes: 13 additions & 1 deletion src/cli/identify.rs
@@ -2,11 +2,12 @@ use std::io::{self, BufRead, BufReader, Write, BufWriter};
use std::fs::File;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Instant;

use anyhow::{Context, Result};
use clap::Args;
use itertools::Itertools;
use log::{debug};
use log::{info, debug};
use pyo3::prelude::*;

use heliport_model::Lang;
@@ -71,10 +72,14 @@ fn parse_langs(langs_text: &Vec<String>) -> Result<Vec<Lang>> {

impl IdentifyCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

// If provided, parse the list of relevant languages
let mut relevant_langs = None;
if let Some(r) = &self.relevant_langs {
relevant_langs = Some(parse_langs(&r).or_abort(1));
info!("Using relevant langs: {:?}", relevant_langs.as_ref().unwrap());
}
debug!("{:?}", self);

@@ -106,19 +111,26 @@ impl IdentifyCmd {
output_file = Box::new(io::stdout().lock());
}

info!("Loading model");
// Load identifier
let mut identifier = Identifier::load(&model_dir, relevant_langs)
.or_abort(1);
if self.ignore_confidence {
info!("Disabled confidence thresholds");
identifier.disable_confidence();
}

// do not run on separate threads if multithreading is not requested
if self.threads == 0 {
info!("Running single-threaded");
self.run_single(identifier, input_file, output_file).or_abort(1);
} else {
info!("Running with {} threads", self.threads);
self.run_parallel(identifier, input_file, output_file).or_abort(1);
}

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}

13 changes: 12 additions & 1 deletion src/cli/mod.rs
@@ -2,6 +2,7 @@ mod identify;
#[cfg(feature = "download")]
mod download;
mod binarize;
mod create_models;

use clap::{Subcommand, Parser};
use log::{debug};
@@ -13,12 +14,15 @@ use crate::python::module_path;
use self::download::DownloadCmd;
use self::binarize::BinarizeCmd;
use self::identify::IdentifyCmd;
use self::create_models::CreateModelCmd;

#[derive(Parser, Clone)]
#[command(version, about, long_about = None)]
pub struct Cli {
#[command(subcommand)]
command: Commands,
#[arg(short, long, help="Do not print log messages")]
quiet: bool,
}

#[derive(Subcommand, Clone)]
@@ -31,6 +35,8 @@ enum Commands {
Binarize(BinarizeCmd),
#[command(about="Identify languages of input text", visible_alias="detect")]
Identify(IdentifyCmd),
#[command(about="Create heliport models")]
CreateModel(CreateModelCmd),
}


@@ -41,12 +47,17 @@ pub fn cli_run() -> PyResult<()> {
let os_args = std::env::args_os().skip(1);
let args = Cli::parse_from(os_args);
debug!("Module path found at: {}", module_path().expect("Could not found module path").display());
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
if !args.quiet {
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
} else {
env_logger::Builder::from_env(Env::default().default_filter_or("error")).init();
}

match args.command {
#[cfg(feature = "download")]
Commands::Download(cmd) => { cmd.cli() },
Commands::Binarize(cmd) => { cmd.cli() },
Commands::Identify(cmd) => { cmd.cli() },
Commands::CreateModel(cmd) => { cmd.cli() },
}
}
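
Since the two branches differ only in the fallback filter string, the same behavior fits in one builder call; note that `Env::default()` still reads `RUST_LOG`, so an explicit environment filter overrides `--quiet` in both versions. A sketch, assuming the same env_logger API:

```
use env_logger::Env;

fn init_logging(quiet: bool) {
    // pick the fallback level once instead of duplicating the builder call;
    // an explicit RUST_LOG still overrides this default either way
    let default_level = if quiet { "error" } else { "info" };
    env_logger::Builder::from_env(Env::default().default_filter_or(default_level)).init();
}
```
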
8 changes: 1 addition & 7 deletions src/identifier.rs
@@ -5,23 +5,17 @@ use std::sync::{Arc, Mutex};
use ordered_float::OrderedFloat;
use strum::{IntoEnumIterator, EnumCount};
use shingles::AsShingles;
use regex::Regex;
use anyhow::Result;
use log::{debug,warn};
use lazy_static::lazy_static;
use rayon::prelude::*;

#[cfg(feature = "python")]
use pyo3::pyclass;

use heliport_model::Model;
use heliport_model::{Lang, LangScores, LangBitmap};
use crate::utils::is_cjk_block;
use crate::utils::{is_cjk_block, RE_NON_ALPHA};

lazy_static! {
static ref RE_NON_ALPHA: Regex = Regex::new(r#"[^#gc\p{L}\p{M}′'’´ʹािीुूृेैोौंँः् া ি ী ু ূ ৃ ে ৈ ো ৌ।্্্я̄\u07A6\u07A7\u07A8\u07A9\u07AA\u07AB\u07AC\u07AD\u07AE\u07AF\u07B0\u0A81\u0A82\u0A83\u0ABC\u0ABD\u0ABE\u0ABF\u0AC0\u0AC1\u0AC2\u0AC3\u0AC4\u0AC5\u0AC6\u0AC7\u0AC8\u0AC9\u0ACA\u0ACB\u0ACC\u0ACD\u0AD0\u0AE0\u0AE1\u0AE2\u0AE3\u0AE4\u0AE5\u0AE6\u0AE7\u0AE8\u0AE9\u0AEA\u0AEB\u0AEC\u0AED\u0AEE\u0AEF\u0AF0\u0AF1]"#)
.expect("Error compiling non-alpha regex for Idenfifier");
}

#[cfg_attr(feature = "python", pyclass)]
pub struct Identifier {
1 change: 1 addition & 0 deletions src/lib.rs
@@ -6,3 +6,4 @@ pub mod utils;
mod cli;
#[cfg(feature = "python")]
mod python;
pub mod trainer;
6 changes: 3 additions & 3 deletions src/python.rs
@@ -12,12 +12,12 @@ pub fn module_path() -> PyResult<PathBuf> {
let mut path = PathBuf::new();
Python::with_gil(|py| {
// Instead of hardcoding the module name, obtain it from the crate name at compile time
let module = PyModule::import_bound(py, env!("CARGO_PKG_NAME"))?;
let paths: Vec<&str> = module
let module = PyModule::import(py, env!("CARGO_PKG_NAME"))?;
let paths: Vec<String> = module
.getattr("__path__")?
.extract()?;
// __path__ attribute returns a list of paths, return first
path.push(paths[0]);
path.push(&paths[0]);
Ok(path)
})
}
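
This hunk tracks the PyO3 0.23 upgrade in Cargo.toml: 0.23 drops the deprecated gil-refs API, so the `_bound` method names revert to their plain forms (`import_bound` becomes `import`), and extracting owned `String`s avoids holding `&str` borrows into the temporary `__path__` object.
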
114 changes: 114 additions & 0 deletions src/trainer.rs
@@ -0,0 +1,114 @@
use std::fs::File;
use std::io::{BufRead, BufReader, Write, BufWriter};
use std::path::Path;

use anyhow::{Context, Result};
use counter::Counter;
use lazy_static::lazy_static;
use log::{info, debug};
use rayon::prelude::*;
use regex::Regex;
use shingles::AsShingles;
use strum::IntoEnumIterator;

use crate::utils::RE_NON_ALPHA;

use heliport_model::{OrderNgram};


lazy_static! {
static ref RE_LANG_NAME: Regex = Regex::new(r"(\w{3,7})\.train$")
.expect("Error compiling lang name from file regex");
}

// Count n-gram frequency of a given n-gram order in the text contained in the file
fn count_ngrams(input_file_path: &Path, order: OrderNgram) -> Result<Counter<String>> {
let input_file = BufReader::new(File::open(input_file_path)?);
let mut counts = Counter::new();

// Read training file line by line and accumulate ngram counts
for line_res in input_file.lines() {
let line = line_res?;
// Replace punctuation by spaces
let replaced = RE_NON_ALPHA.replace_all(&line, " ");

// iterate over words
for word in replaced.split_whitespace() {
// if the current order is Word, just count whole words;
// otherwise wrap the word in space boundaries and count
// every ngram of the current order inside it
if order == OrderNgram::Word {
if let Some(entry) = counts.get_mut(word) {
*entry += 1;
} else {
counts.insert(String::from(word), 1);
}
} else {
let wordspace = format!(" {word} ");
// order can be cast to integer because the internal representations
// have the same number (word is 0, unigram is 1 and so on)
for gram in wordspace.as_shingles(order as usize) {
if let Some(entry) = counts.get_mut(gram) {
*entry += 1;
} else {
counts.insert(String::from(gram), 1);
}
}
}
}
}

Ok(counts)
}

// Count n-gram frequency of all n-gram orders for a given language
pub fn count_all_ngrams(input_file_path: &Path, output_dir: &Path, top_k: usize) -> Result<()> {
// use the lang prefix in the input file as language code
let string_file_name = input_file_path.to_string_lossy();
let lang_string = RE_LANG_NAME
.captures(&string_file_name)
.context("Could not parse language name from input_file")?
.get(1)
.with_context(|| "Could not get first capture group from lang name regex")?
.as_str();
// Check that the language exists
// avoided for now, as it would require recompiling with the new lang before training
// let lang = Lang::from_str(&lang_string)
// .with_context(|| format!("Could not parse lang '{lang_string}'"))?;
info!("Training '{lang_string}'");

// Run training for each ngram order in parallel
let ngram_orders: Vec<_> = OrderNgram::iter().collect();
let results: Vec<Result<_>> = ngram_orders
.into_par_iter()
.map(|order| -> Result<()> {
// Obtain ngram frequencies
let counts = count_ngrams(input_file_path, order)?;
// create output file with the language code and ngram order as name
let output_file =
File::create(output_dir.join(format!("{}.{}.model", lang_string, order)))
.with_context(|| "Could not create file")?;
let mut output_file = BufWriter::new(output_file);
let total = counts.total::<usize>();
debug!(
"Total: {} top-10: {:?}",
total,
counts.k_most_common_ordered(10)
);

// Write the top-k most frequent n-grams with their frequencies and the total count
writeln!(&mut output_file, "{}", total)?;
for (ngram, count) in counts.k_most_common_ordered(top_k) {
writeln!(&mut output_file, "{ngram}\t{count}")?;
}
Ok(())
}).collect();

for r in results {
let _ = r?;
}

info!("Finished '{lang_string}'");
Ok(())
}
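
The files written above have a simple shape: one line with the total n-gram count, then one `ngram<TAB>count` line per entry. A minimal reader sketch for that format; the layout comes from the writer above, while the names and error messages are assumptions:

```
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use anyhow::{Context, Result};

// Parse one "<lang>.<order>.model" file back into (total, entries).
fn read_model(path: &Path) -> Result<(usize, Vec<(String, usize)>)> {
    let mut lines = BufReader::new(File::open(path)?).lines();
    // the first line holds the total n-gram count
    let total: usize = lines.next().context("empty model file")??.trim().parse()?;
    let mut entries = Vec::new();
    for line in lines {
        let line = line?;
        let (ngram, count) = line.split_once('\t').context("line without tab")?;
        entries.push((ngram.to_string(), count.parse()?));
    }
    Ok((total, entries))
}
```
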
10 changes: 9 additions & 1 deletion src/utils.rs
@@ -1,8 +1,15 @@
use std::process::exit;

use lazy_static::lazy_static;
use log::error;
use regex::Regex;
use unicode_blocks;

lazy_static! {
pub static ref RE_NON_ALPHA: Regex = Regex::new(r#"[^#gc\p{L}\p{M}′'’´ʹािीुूृेैोौंँः् া ি ী ু ূ ৃ ে ৈ ো ৌ।্্্я̄\u07A6\u07A7\u07A8\u07A9\u07AA\u07AB\u07AC\u07AD\u07AE\u07AF\u07B0\u0A81\u0A82\u0A83\u0ABC\u0ABD\u0ABE\u0ABF\u0AC0\u0AC1\u0AC2\u0AC3\u0AC4\u0AC5\u0AC6\u0AC7\u0AC8\u0AC9\u0ACA\u0ACB\u0ACC\u0ACD\u0AD0\u0AE0\u0AE1\u0AE2\u0AE3\u0AE4\u0AE5\u0AE6\u0AE7\u0AE8\u0AE9\u0AEA\u0AEB\u0AEC\u0AED\u0AEE\u0AEF\u0AF0\u0AF1]"#)
.expect("Error compiling non-alpha regex for Idenfifier");
}

// Trait that extracts the contained Ok value, or aborts on error
// after sending the error message to the log
pub trait Abort<T> {
@@ -14,7 +21,8 @@ impl<T, E: std::fmt::Display> Abort<T> for Result<T, E>
fn or_abort(self, exit_code: i32) -> T {
match self {
Ok(v) => v,
Err(e) => { error!("{e}"); exit(exit_code); },
// Print the whole error context with :#
Err(e) => { error!("{e:#}"); exit(exit_code); },
}
}
}
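
The `{e:#}` change is what surfaces the `with_context` annotations added elsewhere in this PR. A small illustration of the difference, assuming anyhow's error type:

```
use anyhow::{Context, Result};

fn load() -> Result<Vec<u8>> {
    std::fs::read("missing.train").context("Error with file 'missing.train'")
}

fn main() {
    if let Err(e) = load() {
        println!("{e}");   // prints only the outermost context message
        println!("{e:#}"); // appends each cause, e.g. ": No such file or directory (os error 2)"
    }
}
```
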