diff --git a/DESCRIPTION b/DESCRIPTION index 5b5776e..83694c6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,8 +13,8 @@ Description: Learn vector representations of words by continuous bag of words an URL: https://github.com/bnosac/word2vec License: Apache License (>= 2.0) Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Depends: R (>= 2.10) -Imports: Rcpp (>= 0.11.5), stats +Imports: Rcpp (>= 0.11.5), stats, fastmatch LinkingTo: Rcpp, RcppProgress Suggests: udpipe diff --git a/NAMESPACE b/NAMESPACE index e2f823c..520c183 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ S3method(predict,word2vec_trained) S3method(summary,word2vec) S3method(summary,word2vec_trained) S3method(word2vec,list) +S3method(word2vec,tokens) export(doc2vec) export(read.word2vec) export(read.wordvectors) diff --git a/R/RcppExports.R b/R/RcppExports.R index d6c90b8..08c533a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,7 +21,3 @@ w2v_nearest_vector <- function(ptr, x, top_n = 10L, min_distance = 0.0) { .Call('_word2vec_w2v_nearest_vector', PACKAGE = 'word2vec', ptr, x, top_n, min_distance) } -w2v_read_binary <- function(modelFile, normalize, n) { - .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n) -} - diff --git a/R/word2vec.R b/R/word2vec.R index 6cfbcea..8d85a58 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -156,13 +156,14 @@ word2vec <- function(x, #' modelb <- word2vec(x = txt, dim = 15, iter = 20, split = c(" \n\r", "\n\r")) #' all.equal(as.matrix(modela), as.matrix(modelb)) #' \dontshow{\} # End of main if statement running only if the required packages are installed} -word2vec.list <- function(x, +word2vec.tokens <- function(x, type = c("cbow", "skip-gram"), dim = 50, window = ifelse(type == "cbow", 5L, 10L), iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, stopwords = integer(), threads = 1L, ...){ + #x <- lapply(x, as.character) type <- match.arg(type) stopwords <- as.integer(stopwords) @@ -182,10 +183,8 @@ word2vec.list <- function(x, iter <- as.integer(iter) lr <- as.numeric(lr) skipgram <- as.logical(type %in% "skip-gram") - encoding <- "UTF-8" - model <- w2v_train(x, stopwords, - modelFile = model, - minWordFreq = min_count, + + model <- w2v_train(x, attr(x, "types"), minWordFreq = min_count, size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, alpha = lr, withSG = skipgram, ...) @@ -193,6 +192,26 @@ word2vec.list <- function(x, model } +#' @export +word2vec.list <- function(x, ...){ + if (!is.character(attr(x, "types"))) { + x <- serialize(x, stopwords) + class(x) <- "tokens" + } + word2vec(x, ...) +} + +serialize <- function(x, stopwords) { + vocaburary <- unique(unlist(x, use.names = FALSE)) + vocaburary <- setdiff(vocaburary, stopwords) + x <- lapply(x, function(x) { + v <- fastmatch::fmatch(x, vocaburary) + v[is.na(v)] <- 0L + return(v) + }) + attr(x, "types") <- vocaburary + return(x) +} #' @title Get the word vectors of a word2vec model #' @description Get the word vectors of a word2vec model as a dense matrix. diff --git a/man/word2vec.list.Rd b/man/word2vec.tokens.Rd similarity index 98% rename from man/word2vec.list.Rd rename to man/word2vec.tokens.Rd index b92d8f8..af58632 100644 --- a/man/word2vec.list.Rd +++ b/man/word2vec.tokens.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/word2vec.R -\name{word2vec.list} -\alias{word2vec.list} +\name{word2vec.tokens} +\alias{word2vec.tokens} \title{Train a word2vec model on text} \usage{ -\method{word2vec}{list}( +\method{word2vec}{tokens}( x, type = c("cbow", "skip-gram"), dim = 50, diff --git a/src/Makevars b/src/Makevars index 620ba58..0008eff 100644 --- a/src/Makevars +++ b/src/Makevars @@ -2,7 +2,6 @@ PKG_LIBS = -pthread PKG_CPPFLAGS = -pthread -DSTRICT_R_HEADERS -I./word2vec/include -I./word2vec/lib SOURCES = word2vec/lib/huffmanTree.cpp \ - word2vec/lib/mapper.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ diff --git a/src/Makevars.win b/src/Makevars.win index 459c5a1..948d8c7 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -2,7 +2,6 @@ PKG_LIBS = -pthread PKG_CPPFLAGS = -pthread -DSTRICT_R_HEADERS -I./word2vec/include -I./word2vec/lib SOURCES = word2vec/lib/huffmanTree.cpp \ - word2vec/lib/mapper.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ebd9209..e72dd60 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -88,19 +88,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// w2v_read_binary -Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n); -RcppExport SEXP _word2vec_w2v_read_binary(SEXP modelFileSEXP, SEXP normalizeSEXP, SEXP nSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::string >::type modelFile(modelFileSEXP); - Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - Rcpp::traits::input_parameter< std::size_t >::type n(nSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_read_binary(modelFile, normalize, n)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, @@ -108,7 +95,6 @@ static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2}, {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4}, - {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3}, {NULL, NULL, 0} }; diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index dc22480..4853e42 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -5,7 +5,6 @@ #include #include #include "word2vec.hpp" -#include "wordReader.hpp" #include // [[Rcpp::depends(RcppProgress)]] @@ -82,29 +81,6 @@ Rcpp::List w2v_train(Rcpp::List texts_, if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = model->train(trainSettings, corpus, - //trainFile, stopWordsFile, // NOTE: remove - // [&p] (float _percent) { - // p.update(_percent / 2); - // /* - // std::cout << "\rParsing train data... " - // << std::fixed << std::setprecision(2) - // << _percent << "%" << std::flush; - // */ - // }, - // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - // /* - // Rcpp::Rcerr << std::endl - // << "Finished reading data: " << std::endl - // << "Vocabulary size: " << _vocWords << std::endl - // << "Train words: " << _trainWords << std::endl - // << "Total words: " << _totalWords << std::endl - // << "Start training" << std::endl - // << std::endl; - // */ - // vocWords = _vocWords; - // trainWords = _trainWords; - // totalWords = _totalWords; - // }, [&p] (float _alpha, float _percent) { /* std::cout << '\r' @@ -119,26 +95,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, p.update(_percent); } ); - //std::cout << std::endl; } else { - trained = model->train(trainSettings, corpus, - //trainFile, stopWordsFile, // NOTE: remove - // nullptr, - // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - // /* - // Rcpp::Rcerr << std::endl - // << "Finished reading data: " << std::endl - // << "Vocabulary size: " << _vocWords << std::endl - // << "Train words: " << _trainWords << std::endl - // << "Total words: " << _totalWords << std::endl - // << "Start training" << std::endl - // << std::endl; - // */ - // vocWords = _vocWords; - // trainWords = _trainWords; - // totalWords = _totalWords; - // }, - nullptr); + trained = model->train(trainSettings, corpus, nullptr); } Rcpp::Rcout << "Training done\n"; //return Rcpp::List::create(); @@ -313,6 +271,8 @@ Rcpp::List w2v_nearest_vector(SEXP ptr, return out; } +/* NOTE: temporarily disabled + // [[Rcpp::export]] Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n) { try { @@ -416,9 +376,6 @@ Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, return embedding_default; } -/* NOTE: temporarily disabled - - // [[Rcpp::export]] Rcpp::List d2vec(SEXP ptr, Rcpp::StringVector x, std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { Rcpp::XPtr model_w2v(ptr); diff --git a/src/word2vec/include/mapper.hpp b/src/word2vec/include/mapper.hpp deleted file mode 100644 index afa066c..0000000 --- a/src/word2vec/include/mapper.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/** - * @file - * @brief mapper classes - mapping wrappers - * @author Max Fomichev - * @date 19.04.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#ifndef WORD2VEC_MAPPER_H -#define WORD2VEC_MAPPER_H - -#include - -namespace w2v { - /// @brief base class for different data sources (file, std::string etc) to be mapped - class mapper_t { - protected: - union { // mapped memory - char *rwData; // read/write access - const char *roData; // read only access - } m_data; - off_t m_size = 0; // mapped memory size - - public: - mapper_t(): m_data() {} - mapper_t(char *_data, off_t _size): m_data(), m_size(_size) {m_data.rwData = _data;} - mapper_t(const char *_data, off_t _size): m_data(), m_size(_size) {m_data.roData = _data;} - virtual ~mapper_t() = default; - - /// @returns pointer to mapped data in read-only mode - inline const char *data() const noexcept {return m_data.roData;} - /// @returns pointer to mapped data in read/write mode - inline char *data() noexcept {return m_data.rwData;} - /// @returns mapped memory size - inline off_t size() const noexcept {return m_size;} - }; - - class stringMapper_t final: public mapper_t { - public: - /** - * Constructs a fileMapper object for reading or writing, depending on parameters - * @param _fileName file name to be opened for reading or created for writing - * @param _wrFlag create file for writing (default is false - open for reading) - * @param _size size of a new created file (_wrFlag == true) - * @throws std::runtime_error In case of failed file or mapping operations - */ - explicit stringMapper_t(const std::string &_source): - mapper_t(_source.c_str(), static_cast(_source.length())) {} - - // copying prohibited - stringMapper_t(const stringMapper_t &) = delete; - void operator=(const stringMapper_t &) = delete; - }; - /** - * @brief C++ wrapper on mmap() system call - * - * fileMapper class is a simple wrapper on mmap() system call. Both reading from and writing to file are supported. - */ - class fileMapper_t final: public mapper_t { - private: - const std::string m_fileName; // name of the file to be mapped - int m_fd = -1; // file descriptor - const bool m_wrFlag = false; // write mode - - public: - /** - * Constructs a fileMapper object for reading or writing, depending on parameters - * @param _fileName file name to be opened for reading or created for writing - * @param _wrFlag create file for writing (default is false, open for reading) - * @param _size size of a new created file (_wrFlag must be true) - * @throws std::runtime_error In case of failed file or mapping operations - */ - explicit fileMapper_t(const std::string &_fileName, bool _wrFlag = false, off_t _size = 0); - ~fileMapper_t() final; - - // copying prohibited - fileMapper_t(const fileMapper_t &) = delete; - void operator=(const fileMapper_t &) = delete; - }; -} - -#endif //WORD2VEC_MAPPER_H diff --git a/src/word2vec/include/wordReader.hpp b/src/word2vec/include/wordReader.hpp deleted file mode 100644 index 8522d75..0000000 --- a/src/word2vec/include/wordReader.hpp +++ /dev/null @@ -1,132 +0,0 @@ -/** - * @file - * @brief wordReader class - fast text parsing - * @author Max Fomichev - * @date 19.04.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#ifndef WORD2VEC_WORDREADER_H -#define WORD2VEC_WORDREADER_H - -#include -#include -#include - -#include "mapper.hpp" - -namespace w2v { - /** - * @brief Text parser (word by word) - * - * wordReader class is a word by word parser of a file mapped into memory by mapper_t derived class object. - * It makes easy to parse a file like memory allocated char array without any read/write calls etc) - */ - template - class wordReader_t final { - private: - const dataMapper_t &m_mapper; // reference to mapper_t derived class object - std::string m_wordDelimiterChars; - std::string m_endOfSentenceChars; - const uint16_t m_maxWordLen; // max word length - off_t m_offset; // current offset - const off_t m_startFrom; // start from position - const off_t m_stopAt; // stop at position - std::string m_word; // current word buffer - std::size_t m_wordPos = 0; // position in the current word buffer - bool m_prvEOS = false; // is the previous char a sentence delimiter char? - - public: - /** - * Constructs a wordReader of a memory mapped file (_mapper object) - * @param _mapper mapper_t derived class object that provides read access to a mapped memory - * @param _offset start parsing from this offset position - * @param _stopAt stop parsing at this position - * @param _maxWordLen max length of a parsing word - * @throws std::range_error In case of _offset or/and _stopAt are out of bounds - */ - wordReader_t(const dataMapper_t &_mapper, - std::string _wordDelimiterChars, - std::string _endOfSentenceChars, - off_t _offset = 0, off_t _stopAt = 0, uint16_t _maxWordLen = 100): - m_mapper(_mapper), - m_wordDelimiterChars(std::move(_wordDelimiterChars)), - m_endOfSentenceChars(std::move(_endOfSentenceChars)), - m_maxWordLen(_maxWordLen), m_offset(_offset), - m_startFrom(m_offset), m_stopAt((_stopAt == 0)?_mapper.size() - 1:_stopAt), - m_word(m_maxWordLen, 0) { - - if (m_stopAt >= m_mapper.size()) { - throw std::range_error("wordReader: bounds are out of the file size"); - } - if (m_offset > m_stopAt) { - throw std::range_error("wordReader: offset is out of the bounds"); - } - } - - // copying prohibited - wordReader_t(const wordReader_t &) = delete; - void operator=(const wordReader_t &) = delete; - - /// @returns current offset - inline off_t offset() const noexcept {return m_offset;} - - /// Resets parser state, start parsing from the begining - inline void reset() noexcept { - m_offset = m_startFrom; - m_wordPos = 0; - m_prvEOS = false; - } - - /** - * Reads next word - * @param[out] _word string where the next parsed word to be stored. Empty string means end of sentence. - * @returns true if word is succesfuly parsed, false in case of EOF or end of parsing block reached (_stopAt). - */ - inline bool nextWord(std::string &_word) noexcept { - while (m_offset <= m_stopAt) { - char ch = m_mapper.data()[m_offset++]; - if (m_wordDelimiterChars.find(ch) != std::string::npos) { // is it a word/sentence delimiter? - if (m_endOfSentenceChars.find(ch) != std::string::npos) { // is it the end of sentence (EOS)? - if (m_wordPos > 0) { // is here any buffered word? if yes - return this word and move back - m_offset--; - m_prvEOS = false; - break; - } else { - if (!m_prvEOS) { // Do not return repeated EOS, return only the first occurrence. - _word.clear(); - m_prvEOS = true; - return true; - } else { - continue; // skip this EOS - } - } - } - if (m_wordPos > 0) { // it is a word delimiter, is here any buffered word? - m_prvEOS = false; - break; - } else { - continue; // skip repeated word delimiters - } - } - if (m_wordPos < m_maxWordLen) { // check bounds - m_word[m_wordPos++] = ch; // it's next char of buffered word - } - } - if (m_wordPos > 0) { // return buffered word - try { - _word.resize(m_wordPos); - std::copy(m_word.data(), m_word.data() + m_wordPos, &_word[0]); - } catch (...) { // bad_alloc - return false; - } - m_wordPos = 0; - return true; - } - - return false; // eof or end of the requested block - } - }; -} - -#endif // WORD2VEC_WORDREADER_H diff --git a/src/word2vec/lib/CMakeLists.txt b/src/word2vec/lib/CMakeLists.txt index ad6c414..00a6a6d 100644 --- a/src/word2vec/lib/CMakeLists.txt +++ b/src/word2vec/lib/CMakeLists.txt @@ -37,5 +37,3 @@ target_link_libraries(${PROJECT_NAME} ${LIBS}) install(TARGETS ${PROJECT_NAME} DESTINATION lib) install(FILES ${PROJECT_INCLUDE_DIR}/word2vec.hpp DESTINATION include) -install(FILES ${PROJECT_INCLUDE_DIR}/mapper.hpp DESTINATION include) -install(FILES ${PROJECT_INCLUDE_DIR}/wordReader.hpp DESTINATION include) diff --git a/src/word2vec/lib/mapper.cpp b/src/word2vec/lib/mapper.cpp deleted file mode 100644 index 16c0b9e..0000000 --- a/src/word2vec/lib/mapper.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/** - * @file - * @brief fileMapper & wordReader classes - fast text file/memory parsing - * @author Max Fomichev - * @date 19.04.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#include -#include -#ifdef WIN32 -#include "win/mman.h" -#else -#include -#endif -#include -#include -#include - -#include -#include - -#include "mapper.hpp" - -namespace w2v { - fileMapper_t::fileMapper_t(const std::string &_fileName, bool _wrFlag, off_t _size): - mapper_t(), m_fileName(_fileName), m_wrFlag(_wrFlag) { - - if (m_wrFlag) { - m_size = _size; - } - - // open file - m_fd = ::open(m_fileName.c_str(), m_wrFlag?(O_RDWR | O_CREAT):O_RDONLY, 0600); - if (m_fd < 0) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - - // get file size - struct stat fst{}; - if (fstat(m_fd, &fst) < 0) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - - if (!m_wrFlag) { - if (fst.st_size <= 0) { - throw std::runtime_error(std::string("fileMapper: file ") + _fileName + " is empty, nothing to read"); - } - - m_size = fst.st_size; - } else { - if (ftruncate(m_fd, m_size) == -1) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - } - - // map file to memory - m_data.rwData = static_cast(mmap(nullptr, static_cast(m_size), - m_wrFlag?(PROT_READ | PROT_WRITE):PROT_READ , MAP_SHARED, - m_fd, 0)); - if (m_data.rwData == static_cast(MAP_FAILED)) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - } - - fileMapper_t::~fileMapper_t() { -#if defined(sun) || defined(__sun) - munmap(m_data.rwData, static_cast(m_size)); -#else - munmap(reinterpret_cast(m_data.rwData), static_cast(m_size)); -#endif - close(m_fd); - } -} diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index f7af72a..a6c1b92 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -18,8 +18,6 @@ #include #include "word2vec.hpp" -//#include "wordReader.hpp" -//#include "vocabulary.hpp" #include "huffmanTree.hpp" #include "nsDistribution.hpp" #include "downSampling.hpp" diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 0699812..1a90521 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -7,8 +7,6 @@ */ #include #include "word2vec.hpp" -#include "wordReader.hpp" -//#include "vocabulary.hpp" #include "trainer.hpp" namespace w2v {