Skip to content

Commit

Permalink
USE R RANDOMISATION AND CRAN COMPLIANCE by merging branch 'r-randomis…
Browse files Browse the repository at this point in the history
…ation'
  • Loading branch information
jwijffels committed Jan 3, 2019
2 parents ccf2c46 + 0316660 commit 3df756d
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 102 deletions.
41 changes: 27 additions & 14 deletions src/Starspace/src/data.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <Rcpp.h>
/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
Expand Down Expand Up @@ -28,7 +29,7 @@ InternDataHandler::InternDataHandler(shared_ptr<Args> args) {
}

void InternDataHandler::errorOnZeroExample(const string& fileName) {
std::cerr << "ERROR: File '" << fileName
Rcpp::Rcerr << "ERROR: File '" << fileName
<< "' does not contain any valid example.\n"
<< "Please check: is the file empty? "
<< "Do the examples contain proper feature and label according to the trainMode? "
Expand All @@ -42,12 +43,12 @@ void InternDataHandler::loadFromFile(

ifstream fin(fileName);
if (!fin.is_open()) {
std::cerr << fileName << " cannot be opened for loading!" << std::endl;
Rcpp::Rcerr << fileName << " cannot be opened for loading!" << std::endl;
exit(EXIT_FAILURE);
}
fin.close();

cout << "Loading data from file : " << fileName << endl;
Rcpp::Rcout << "Loading data from file : " << fileName << endl;
vector<Corpus> corpora(args_->thread);
foreach_line(
fileName,
Expand All @@ -69,7 +70,7 @@ void InternDataHandler::loadFromFile(
std::copy(subcorp.begin(), subcorp.end(), examples_.begin() + destCursor);
destCursor += subcorp.size();
}
cout << "Total number of examples loaded : " << examples_.size() << endl;
Rcpp::Rcout << "Total number of examples loaded : " << examples_.size() << endl;
size_ = examples_.size();
if (size_ == 0) {
errorOnZeroExample(fileName);
Expand All @@ -94,13 +95,15 @@ void InternDataHandler::convert(
// lhs is the same, pick one random label as rhs
assert(example.LHSTokens.size() > 0);
assert(example.RHSTokens.size() > 0);
auto idx = rand() % example.RHSTokens.size();
//auto idx = rand() % example.RHSTokens.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSTokens.size()));
rslt.RHSTokens.push_back(example.RHSTokens[idx]);
} else {
assert(example.RHSTokens.size() > 1);
if (args_->trainMode == 1) {
// pick one random label as rhs and the rest is lhs
auto idx = rand() % example.RHSTokens.size();
//auto idx = rand() % example.RHSTokens.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSTokens.size()));
for (unsigned int i = 0; i < example.RHSTokens.size(); i++) {
auto tok = example.RHSTokens[i];
if (i == idx) {
Expand All @@ -112,7 +115,8 @@ void InternDataHandler::convert(
} else
if (args_->trainMode == 2) {
// pick one random label as lhs and the rest is rhs
auto idx = rand() % example.RHSTokens.size();
//auto idx = rand() % example.RHSTokens.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSTokens.size()));
for (unsigned int i = 0; i < example.RHSTokens.size(); i++) {
auto tok = example.RHSTokens[i];
if (i == idx) {
Expand All @@ -124,10 +128,12 @@ void InternDataHandler::convert(
} else
if (args_->trainMode == 3) {
// pick two random labels, one as lhs and the other as rhs
auto idx = rand() % example.RHSTokens.size();
//auto idx = rand() % example.RHSTokens.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSTokens.size()));
unsigned int idx2;
do {
idx2 = rand() % example.RHSTokens.size();
//idx2 = rand() % example.RHSTokens.size();
idx2 = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSTokens.size()));
} while (idx2 == idx);
rslt.LHSTokens.push_back(example.RHSTokens[idx]);
rslt.RHSTokens.push_back(example.RHSTokens[idx2]);
Expand Down Expand Up @@ -192,7 +198,8 @@ void InternDataHandler::getNextExample(ParseResults& rslt) {

// Converts one randomly chosen stored example into rslt.
// The position is drawn with R's uniform generator (R::runif) rather than
// rand(); only that single declaration of idx is kept.
void InternDataHandler::getRandomExample(ParseResults& rslt) const {
  assert(size_ > 0);
  // floor(u * size_) turns the uniform draw u into a position in examples_.
  const int32_t idx = static_cast<int32_t>(floor(R::runif(0, 1) * size_));
  convert(examples_[idx], rslt);
}

Expand Down Expand Up @@ -234,8 +241,11 @@ void InternDataHandler::initWordNegatives() {

// Returns one left-hand-side token taken from a randomly selected example.
// Both random positions are drawn with R's uniform generator (R::runif)
// rather than rand().
Base InternDataHandler::genRandomWord() const {
  assert(size_ > 0);
  // First draw: which example to read from.
  const unsigned int j =
      static_cast<unsigned int>(floor(R::runif(0, 1) * size_));
  const auto& ex = examples_[j];
  // Indexing an empty token list below would be out of bounds.
  assert(ex.LHSTokens.size() > 0);
  // Second draw: which token of that example to return.
  const int r = static_cast<int>(floor(R::runif(0, 1) * ex.LHSTokens.size()));
  return ex.LHSTokens[r];
}

Expand All @@ -244,8 +254,11 @@ Base InternDataHandler::genRandomWord() const {
void InternDataHandler::getRandomRHS(vector<Base>& results) const {
assert(size_ > 0);
results.clear();
auto& ex = examples_[rand() % size_];
unsigned int r = rand() % ex.RHSTokens.size();
//auto& ex = examples_[rand() % size_];
unsigned int j = static_cast<unsigned int>(floor(R::runif(0, 1) * size_));
auto& ex = examples_[j];
//unsigned int r = rand() % ex.RHSTokens.size();
unsigned int r = static_cast<unsigned int>(floor(R::runif(0, 1) * ex.RHSTokens.size()));
if (args_->trainMode == 2) {
for (unsigned int i = 0; i < ex.RHSTokens.size(); i++) {
if (i != r) {
Expand Down
23 changes: 12 additions & 11 deletions src/Starspace/src/dict.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <Rcpp.h>
/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
Expand Down Expand Up @@ -135,10 +136,10 @@ void Dictionary::readFromFile(
const std::string& file,
shared_ptr<DataParser> parser) {

cout << "Build dict from input file : " << file << endl;
Rcpp::Rcout << "Build dict from input file : " << file << endl;
ifstream fin(file);
if (!fin.is_open()) {
cerr << "Input file cannot be opened!" << endl;
Rcpp::Rcerr << "Input file cannot be opened!" << endl;
exit(EXIT_FAILURE);
}
int64_t minThreshold = 1;
Expand All @@ -151,7 +152,7 @@ void Dictionary::readFromFile(
for (auto token : tokens) {
insert(token);
if ((ntokens_ % 1000000 == 0) && args_->verbose) {
std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
Rcpp::Rcerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
}
if (size_ > 0.75 * MAX_VOCAB_SIZE) {
minThreshold++;
Expand All @@ -163,15 +164,15 @@ void Dictionary::readFromFile(

threshold(args_->minCount, args_->minCountLabel);

std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::endl;
std::cerr << "Number of words in dictionary: " << nwords_ << std::endl;
std::cerr << "Number of labels in dictionary: " << nlabels_ << std::endl;
Rcpp::Rcout << "\rRead " << ntokens_ / 1000000 << "M words" << std::endl;
Rcpp::Rcout << "Number of words in dictionary: " << nwords_ << std::endl;
Rcpp::Rcout << "Number of labels in dictionary: " << nlabels_ << std::endl;
if (lines_read == 0) {
std::cerr << "ERROR: Empty file." << std::endl;
Rcpp::Rcerr << "ERROR: Empty file." << std::endl;
exit(EXIT_FAILURE);
}
if (size_ == 0) {
std::cerr << "Empty vocabulary. Try a smaller -minCount value."
Rcpp::Rcerr << "Empty vocabulary. Try a smaller -minCount value."
<< std::endl;
exit(EXIT_FAILURE);
}
Expand Down Expand Up @@ -209,7 +210,7 @@ void Dictionary::computeCounts() {

// Given a model saved in .tsv format, build the dictionary from model.
void Dictionary::loadDictFromModel(const string& modelfile) {
cout << "Loading dict from model file : " << modelfile << endl;
Rcpp::Rcout << "Loading dict from model file : " << modelfile << endl;
ifstream fin(modelfile);
string line;
while (getline(fin, line)) {
Expand All @@ -221,8 +222,8 @@ void Dictionary::loadDictFromModel(const string& modelfile) {
fin.close();
computeCounts();

std::cout << "Number of words in dictionary: " << nwords_ << std::endl;
std::cout << "Number of labels in dictionary: " << nlabels_ << std::endl;
Rcpp::Rcout << "Number of words in dictionary: " << nwords_ << std::endl;
Rcpp::Rcout << "Number of labels in dictionary: " << nlabels_ << std::endl;
}

} // namespace
43 changes: 32 additions & 11 deletions src/Starspace/src/doc_data.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <Rcpp.h>
/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
Expand Down Expand Up @@ -31,12 +32,12 @@ void LayerDataHandler::loadFromFile(

ifstream fin(fileName);
if (!fin.is_open()) {
std::cerr << fileName << " cannot be opened for loading!" << std::endl;
Rcpp::Rcerr << fileName << " cannot be opened for loading!" << std::endl;
exit(EXIT_FAILURE);
}
fin.close();

cout << "Loading data from file : " << fileName << endl;
Rcpp::Rcout << "Loading data from file : " << fileName << endl;
vector<Corpus> corpora(args_->thread);
foreach_line(
fileName,
Expand All @@ -58,7 +59,7 @@ void LayerDataHandler::loadFromFile(
std::copy(subcorp.begin(), subcorp.end(), examples_.begin() + destCursor);
destCursor += subcorp.size();
}
cout << "Total number of examples loaded : " << examples_.size() << endl;
Rcpp::Rcout << "Total number of examples loaded : " << examples_.size() << endl;
size_ = examples_.size();
if (size_ == 0) {
errorOnZeroExample(fileName);
Expand All @@ -75,6 +76,7 @@ void LayerDataHandler::insert(
rslt.insert(rslt.end(), ex.begin(), ex.end());
} else {
// dropout enabled
/*
auto rnd = [&] {
static thread_local unsigned int rState;
Expand All @@ -92,8 +94,10 @@ void LayerDataHandler::insert(
rand_r(&rState);
#endif
};
*/
for (const auto& it : ex) {
auto p = (double)(rnd()) / RAND_MAX;
//auto p = (double)(rnd()) / RAND_MAX;
double p = R::runif(0, 1);
if (p > dropout) {
rslt.push_back(it);
}
Expand All @@ -110,7 +114,8 @@ void LayerDataHandler::getWordExamples(
assert(example.RHSFeatures.size() > 0);

// take one random sentence and train on word
auto r = rand() % example.RHSFeatures.size();
//auto r = rand() % example.RHSFeatures.size();
unsigned int r = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSFeatures.size()));
InternDataHandler::getWordExamples(example.RHSFeatures[r], rslts);
}

Expand All @@ -126,13 +131,15 @@ void LayerDataHandler::convert(
assert(example.LHSTokens.size() > 0);
assert(example.RHSFeatures.size() > 0);
insert(rslt.LHSTokens, example.LHSTokens, args_->dropoutLHS);
auto idx = rand() % example.RHSFeatures.size();
//auto idx = rand() % example.RHSFeatures.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSFeatures.size()));
insert(rslt.RHSTokens, example.RHSFeatures[idx], args_->dropoutRHS);
} else {
assert(example.RHSFeatures.size() > 1);
if (args_->trainMode == 1) {
// pick one random rhs as label, the rest becomes lhs features
auto idx = rand() % example.RHSFeatures.size();
//auto idx = rand() % example.RHSFeatures.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSFeatures.size()));
for (unsigned int i = 0; i < example.RHSFeatures.size(); i++) {
if (i == idx) {
insert(rslt.RHSTokens, example.RHSFeatures[i], args_->dropoutRHS);
Expand All @@ -143,7 +150,8 @@ void LayerDataHandler::convert(
} else
if (args_->trainMode == 2) {
// pick one random rhs as lhs, the rest becomes rhs features
auto idx = rand() % example.RHSFeatures.size();
//auto idx = rand() % example.RHSFeatures.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSFeatures.size()));
for (unsigned int i = 0; i < example.RHSFeatures.size(); i++) {
if (i == idx) {
insert(rslt.LHSTokens, example.RHSFeatures[i], args_->dropoutLHS);
Expand All @@ -154,12 +162,14 @@ void LayerDataHandler::convert(
} else
if (args_->trainMode == 3) {
// pick one random rhs as input
auto idx = rand() % example.RHSFeatures.size();
//auto idx = rand() % example.RHSFeatures.size();
unsigned int idx = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSFeatures.size()));
insert(rslt.LHSTokens, example.RHSFeatures[idx], args_->dropoutLHS);
// pick another random rhs as label
unsigned int idx2;
do {
idx2 = rand() % example.RHSFeatures.size();
//idx2 = rand() % example.RHSFeatures.size();
idx2 = static_cast<unsigned int>(floor(R::runif(0, 1) * example.RHSFeatures.size()));
} while (idx == idx2);
insert(rslt.RHSTokens, example.RHSFeatures[idx2], args_->dropoutRHS);
} else
Expand All @@ -174,16 +184,27 @@ void LayerDataHandler::convert(
// generate a random word from examples
//
// Three draws are made, in this order: an example, one of that example's
// right-hand-side feature lists, and one entry of that list. Every draw uses
// R's uniform generator (R::runif) rather than rand().
// NOTE(review): R::runif goes through R's RNG; confirm this is never reached
// from worker threads.
Base LayerDataHandler::genRandomWord() const {
  assert(size_ > 0);
  // Turns one uniform draw into a position for a container of n elements.
  // Callers must pass n > 0.
  const auto pick = [](size_t n) {
    return static_cast<size_t>(floor(R::runif(0, 1) * n));
  };
  const auto& ex = examples_[pick(size_)];
  // Guard the nested lookups: indexing an empty list is out of bounds.
  assert(ex.RHSFeatures.size() > 0);
  const auto& feats = ex.RHSFeatures[pick(ex.RHSFeatures.size())];
  assert(feats.size() > 0);
  return feats[pick(feats.size())];
}

void LayerDataHandler::getRandomRHS(vector<Base>& result) const {
assert(size_ > 0);
/*
auto& ex = examples_[rand() % size_];
unsigned int r = rand() % ex.RHSFeatures.size();
*/
unsigned int j = static_cast<unsigned int>(floor(R::runif(0, 1) * size_));
auto& ex = examples_[j];
unsigned int r = static_cast<unsigned int>(floor(R::runif(0, 1) * ex.RHSFeatures.size()));

result.clear();
if (args_->trainMode == 2) {
Expand All @@ -206,7 +227,7 @@ void LayerDataHandler::save(ostream& out) {
}
out << "\nrhs: ";
for (auto feat : example.RHSFeatures) {
for (auto r : feat) { cout << r.first << ':' << r.second << ' '; }
for (auto r : feat) { Rcpp::Rcout << r.first << ':' << r.second << ' '; }
out << "\t";
}
out << endl;
Expand Down
2 changes: 1 addition & 1 deletion src/Starspace/src/doc_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace starspace {
// Constructs the parser by passing dict and args straight to
// DataParser(dict, args); the constructor body itself is empty.
// No semicolon follows the closing brace, and the initializer appears once.
LayerDataParser::LayerDataParser(
    shared_ptr<Dictionary> dict,
    shared_ptr<Args> args)
  : DataParser(dict, args) {}

bool LayerDataParser::parse(
string& s,
Expand Down
Loading

0 comments on commit 3df756d

Please sign in to comment.