bnosac · koheiw · Nov 20, 2023 · Nov 20, 2023 · Nov 20, 2023 · Nov 21, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -13,8 +13,8 @@ Description: Learn vector representations of words by continuous bag of words an
 URL: https://github.com/bnosac/word2vec
 License: Apache License (>= 2.0)
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends: R (>= 2.10)
-Imports: Rcpp (>= 0.11.5), stats
+Imports: Rcpp (>= 0.11.5), stats, fastmatch
 LinkingTo: Rcpp, RcppProgress
 Suggests: udpipe
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ S3method(predict,word2vec_trained)
 S3method(summary,word2vec)
 S3method(summary,word2vec_trained)
 S3method(word2vec,list)
+S3method(word2vec,tokens)
 export(doc2vec)
 export(read.word2vec)
 export(read.wordvectors)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -21,7 +21,3 @@ w2v_nearest_vector <- function(ptr, x, top_n = 10L, min_distance = 0.0) {
     .Call('_word2vec_w2v_nearest_vector', PACKAGE = 'word2vec', ptr, x, top_n, min_distance)
 }
 
-w2v_read_binary <- function(modelFile, normalize, n) {
-    .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n)
-}
-
diff --git a/R/word2vec.R b/R/word2vec.R
@@ -156,13 +156,14 @@ word2vec <- function(x,
 #' modelb <- word2vec(x = txt, dim = 15, iter = 20, split = c(" \n\r", "\n\r"))
 #' all.equal(as.matrix(modela), as.matrix(modelb))
 #' \dontshow{\} # End of main if statement running only if the required packages are installed}
-word2vec.list <- function(x,
+word2vec.tokens <- function(x,
                           type = c("cbow", "skip-gram"),
                           dim = 50, window = ifelse(type == "cbow", 5L, 10L), 
                           iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, 
                           stopwords = integer(),
                           threads = 1L,
                           ...){
+
     #x <- lapply(x, as.character)
     type <- match.arg(type)
     stopwords <- as.integer(stopwords)
@@ -182,17 +183,35 @@ word2vec.list <- function(x,
     iter <- as.integer(iter)
     lr <- as.numeric(lr)
     skipgram <- as.logical(type %in% "skip-gram")
-    encoding <- "UTF-8"
-    model <- w2v_train(x, stopwords,
-                       modelFile = model, 
-                       minWordFreq = min_count,
+
+    model <- w2v_train(x, attr(x, "types"), minWordFreq = min_count,
                        size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, 
                        sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter,
                        alpha = lr, withSG = skipgram, ...)
     model$data$stopwords <- stopwords
     model
 }
 
+#' @export
+word2vec.list <- function(x, stopwords = character(), ...){
+    # `stopwords` is a formal so that serialize() receives a bound value; `...` cannot supply it.
+    if (!is.character(attr(x, "types"))) { # not yet encoded as integer ids with a "types" attribute
+        x <- structure(serialize(x, stopwords), class = "tokens")
+    }
+    word2vec(x, ...) # re-dispatch on the (possibly re-classed) object
+}
+
+serialize <- function(x, stopwords) {
+    # Vocabulary: distinct tokens in order of first appearance, minus the stopwords.
+    types <- setdiff(unique(unlist(x, use.names = FALSE)), stopwords)
+    # Encode each element as 1-based positions in `types`; unmatched tokens become 0L.
+    codes <- lapply(x, function(tokens) {
+        idx <- fastmatch::fmatch(tokens, types)
+        replace(idx, is.na(idx), 0L)
+    })
+    attr(codes, "types") <- types
+    codes
+}
 
 #' @title Get the word vectors of a word2vec model
 #' @description Get the word vectors of a word2vec model as a dense matrix.

diff --git a/man/word2vec.list.Rd → man/word2vec.tokens.Rd b/man/word2vec.list.Rd → man/word2vec.tokens.Rd
diff --git a/src/Makevars b/src/Makevars
@@ -2,7 +2,6 @@ PKG_LIBS = -pthread
 PKG_CPPFLAGS = -pthread -DSTRICT_R_HEADERS -I./word2vec/include -I./word2vec/lib
 
 SOURCES = word2vec/lib/huffmanTree.cpp \
-			word2vec/lib/mapper.cpp \
 			word2vec/lib/nsDistribution.cpp \
 			word2vec/lib/trainer.cpp \
 			word2vec/lib/trainThread.cpp \

diff --git a/src/Makevars.win b/src/Makevars.win
@@ -2,7 +2,6 @@ PKG_LIBS = -pthread
 PKG_CPPFLAGS = -pthread -DSTRICT_R_HEADERS -I./word2vec/include -I./word2vec/lib
 
 SOURCES = word2vec/lib/huffmanTree.cpp \
-			word2vec/lib/mapper.cpp \
 			word2vec/lib/nsDistribution.cpp \
 			word2vec/lib/trainer.cpp \
 			word2vec/lib/trainThread.cpp \

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -88,27 +88,13 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
-// w2v_read_binary
-Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n);
-RcppExport SEXP _word2vec_w2v_read_binary(SEXP modelFileSEXP, SEXP normalizeSEXP, SEXP nSEXP) {
-BEGIN_RCPP
-    Rcpp::RObject rcpp_result_gen;
-    Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< const std::string >::type modelFile(modelFileSEXP);
-    Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP);
-    Rcpp::traits::input_parameter< std::size_t >::type n(nSEXP);
-    rcpp_result_gen = Rcpp::wrap(w2v_read_binary(modelFile, normalize, n));
-    return rcpp_result_gen;
-END_RCPP
-}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17},
     {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1},
     {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2},
     {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4},
     {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4},
-    {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3},
     {NULL, NULL, 0}
 };
 

diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp
@@ -5,7 +5,6 @@
 #include <iostream>
 #include <iomanip>
 #include "word2vec.hpp"
-#include "wordReader.hpp"
 #include <unordered_map>
 
 // [[Rcpp::depends(RcppProgress)]]
@@ -82,29 +81,6 @@ Rcpp::List w2v_train(Rcpp::List texts_,
   if (verbose) { // NOTE: consider removing progress bar
     Progress p(100, true);
     trained = model->train(trainSettings, corpus, 
-                           //trainFile, stopWordsFile, // NOTE: remove
-                           // [&p] (float _percent) {
-                           //   p.update(_percent / 2);
-                           //   /*
-                           //    std::cout << "\rParsing train data... "
-                           //              << std::fixed << std::setprecision(2)
-                           //              << _percent << "%" << std::flush;
-                           //    */
-                           // },
-                           // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) {
-                           //   /*
-                           //    Rcpp::Rcerr << std::endl
-                           //                << "Finished reading data: " << std::endl
-                           //                << "Vocabulary size: " << _vocWords << std::endl
-                           //                << "Train words: " << _trainWords << std::endl
-                           //                << "Total words: " << _totalWords << std::endl
-                           //                << "Start training" << std::endl
-                           //                << std::endl;
-                           //    */
-                           //   vocWords = _vocWords;
-                           //   trainWords = _trainWords;
-                           //   totalWords = _totalWords;
-                           // },
                            [&p] (float _alpha, float _percent) {
                              /*
                               std::cout << '\r'
@@ -119,26 +95,8 @@ Rcpp::List w2v_train(Rcpp::List texts_,
                              p.update(_percent);
                            }
     );
-    //std::cout << std::endl;
   } else {
-    trained = model->train(trainSettings, corpus, 
-                           //trainFile, stopWordsFile, // NOTE: remove
-                           // nullptr, 
-                           // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) {
-                           //   /*
-                           //    Rcpp::Rcerr << std::endl
-                           //                << "Finished reading data: " << std::endl
-                           //                << "Vocabulary size: " << _vocWords << std::endl
-                           //                << "Train words: " << _trainWords << std::endl
-                           //                << "Total words: " << _totalWords << std::endl
-                           //                << "Start training" << std::endl
-                           //                << std::endl;
-                           //    */
-                           //   vocWords = _vocWords;
-                           //   trainWords = _trainWords;
-                           //   totalWords = _totalWords;
-                           // },
-                           nullptr);
+    trained = model->train(trainSettings, corpus, nullptr);
   }
   Rcpp::Rcout << "Training done\n";
   //return Rcpp::List::create();
@@ -313,6 +271,8 @@ Rcpp::List w2v_nearest_vector(SEXP ptr,
   return out;
 }
 
+/* NOTE: temporarily disabled
+
 // [[Rcpp::export]]
 Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n) {
   try {
@@ -416,9 +376,6 @@ Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize,
   return embedding_default;
 }
 
-/* NOTE: temporarily disabled
-
-
 // [[Rcpp::export]]
 Rcpp::List d2vec(SEXP ptr, Rcpp::StringVector x, std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") {
   Rcpp::XPtr<w2v::w2vModel_t> model_w2v(ptr);

diff --git a/src/word2vec/include/mapper.hpp b/src/word2vec/include/mapper.hpp