Skip to content

Commit

Permalink
Performance: do not use R unserialization to retrieve info about SEXP
Browse files Browse the repository at this point in the history
R serialization is not thread safe, and allocates.

Sometimes, we just need to look at the byte serialization of the value and find out facts about it.
This creates a sexp_view type with pointers to relevant parts of the serialized SEXP.
  • Loading branch information
programLyrique committed Jan 28, 2022
1 parent 7c147ed commit 34169d1
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 6 deletions.
13 changes: 8 additions & 5 deletions src/search_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,16 @@ const std::vector<std::pair<std::string, roaring::Roaring64Map>> SearchIndex::bu
results.push_back({"na_index",roaring::Roaring64Map()});



for(uint64_t i = start; i < end ; i++) {
std::vector<std::byte> buf = db.sexp_table.read(i);
SEXP val = PROTECT(db.ser.unserialize(buf));// That might not be thread-safe
const std::vector<std::byte>& buf = db.sexp_table.read(i);

const sexp_view_t sexp_view = Serializer::unserialize_view(buf);

if(find_na(val)) {
if(find_na(sexp_view)) {
results[0].second.add(i);
}

UNPROTECT(1);
}

for(auto& result : results) {
Expand Down Expand Up @@ -165,7 +166,9 @@ void SearchIndex::build_indexes(const Database& db) {

//Parallelize on the 3 independent files
// without std::cref, would actually copy!
std::future<const std::vector<std::pair<std::string, roaring::Roaring64Map>>> results_value_fut = std::async( std::launch::async, build_indexes_values, std::cref(db), last_computed, db.nb_values());
// Running this task in another thread does not work!
// Probably because unserialize tries to allocate
std::future<const std::vector<std::pair<std::string, roaring::Roaring64Map>>> results_value_fut = std::async( std::launch::deferred, build_indexes_values, std::cref(db), last_computed, db.nb_values());

std::future<const std::vector<std::pair<std::string, roaring::Roaring64Map>>> results_meta_fut = std::async( std::launch::async, build_indexes_static_meta, std::cref(db), last_computed, db.nb_values());

Expand Down
60 changes: 60 additions & 0 deletions src/search_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "config.h"

#include "reverse_index.h"
#include "serialization.h"

class Database;

Expand Down Expand Up @@ -91,6 +92,65 @@ inline bool find_na(SEXP val) {
return false;
}

// This version works on the array of bytes directly: it looks for an NA in a
// serialized vector without building an R object.
//
// sexp_view: view over the serialized value (type, number of elements and
//            pointer to the first byte of the payload), such as the one
//            returned by Serializer::unserialize_view.
// Returns true as soon as an NA is found. Returns false when there is none,
// and also for every type that is not handled below.
inline bool find_na(const sexp_view_t& sexp_view) {
  const size_t length = sexp_view.length;

  switch(sexp_view.type) {
    case LGLSXP: {
      const int* v = static_cast<const int*>(sexp_view.data);
#ifdef SXPDB_PARALLEL_STD
      return std::find(std::execution::par_unseq, v, v + length, NA_LOGICAL) != v + length;
#else
      return std::find(v, v + length, NA_LOGICAL) != v + length;
#endif
    }
    case INTSXP: {
      const int* v = static_cast<const int*>(sexp_view.data);
#ifdef SXPDB_PARALLEL_STD
      return std::find(std::execution::par_unseq, v, v + length, NA_INTEGER) != v + length;
#else
      return std::find(v, v + length, NA_INTEGER) != v + length;
#endif
    }
    case REALSXP: {
      const double* v = static_cast<const double*>(sexp_view.data);
#ifdef SXPDB_PARALLEL_STD
      return std::find_if(std::execution::par_unseq, v, v + length, [](double d) -> bool {return ISNAN(d) ;}) != v + length;
#else
      return std::find_if(v, v + length, [](double d) -> bool {return ISNAN(d) ;}) != v + length;
#endif
    }
    case CPLXSXP: {
      // A complex is NA as soon as either its real or imaginary part is
      const Rcomplex* v = static_cast<const Rcomplex*>(sexp_view.data);
#ifdef SXPDB_PARALLEL_STD
      return std::find_if(std::execution::par_unseq, v, v + length, [](const Rcomplex& c) -> bool {return ISNAN(c.r) || ISNAN(c.i);}) != v + length;
#else
      return std::find_if(v, v + length, [](const Rcomplex& c) -> bool {return ISNAN(c.r) || ISNAN(c.i);}) != v + length;
#endif
    }
    case STRSXP: {
      // This one is more complex as it stores CHARSXPs, which do not all have
      // the same byte size: the payload has to be walked item by item.
      const char* data = static_cast<const char*>(sexp_view.data);
      for(size_t i = 0; i < length; i++) {
        // Every CHARSXP is serialized as an item of its own, so it starts with
        // a flags word that comes before its size.
        // NOTE(review): this follows R's serialization format (WriteItem in
        // serialize.c); confirm against the buffers actually stored.
        data += sizeof(int);

        int size = 0;
        std::memcpy(&size, data, sizeof(int));
        data += sizeof(int);
        if(size == -1) {// this is NA_STRING
          return true;
        }
        // The empty string is a valid element of size 0
        assert(size >= 0);
        // else we jump over the bytes of this CHARSXP (its size in bytes, not
        // the number of elements of the vector) to reach the next one
        data += size;
      }
      return false;
    }
    default:
      break;
  }

  return false;
}

class SearchIndex {
friend class Query;
public:
Expand Down
73 changes: 73 additions & 0 deletions src/serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,79 @@ std::byte* Serializer::jump_header(std::vector<std::byte>& buffer) {
return reinterpret_cast<std::byte*>(buf);
}

// Decodes the length of a serialized vector stored at buf.
//
// The length is one int-sized word, unless that word is -1: in that case the
// length is spread over the next two words, upper 32 bits first, then lower
// 32 bits.
//
// buf:  position of the first length word.
// size: receives the decoded length.
// Returns the position right after the last word that was consumed.
const char* read_length(const char* buf, size_t& size) {
  constexpr size_t word = sizeof(int);

  int first = 0;
  std::memcpy(&first, buf, word);
  const char* cursor = buf + word;

  // Common case: the length fits in the first word
  if(first != -1) {
    size = first;
    return cursor;
  }

  // -1 is the marker for a length stored as two additional words
  unsigned int upper = 0;
  unsigned int lower = 0;
  std::memcpy(&upper, cursor, word);
  std::memcpy(&lower, cursor + word, word);

  size = (static_cast<size_t>(upper) << 32) + lower;
  return cursor + 2 * word;
}

// Builds a view over a serialized value without unserializing it.
//
// buf: serialized value, with the serialization header already stripped, so
//      that it starts with the flags word of the value.
// Returns a view whose type is read from the flags word. For the vector types
// handled here (LGLSXP, INTSXP, REALSXP, CPLXSXP, STRSXP), length is the
// decoded vector length and data points right after the length, inside buf:
// the view must not outlive buf. For any other type, only the type is set.
// A buffer too short to hold the flags word yields a default view; one too
// short to hold a length word yields a view with only the type set.
// Only these leading words are bounds-checked, not the payload itself.
const sexp_view_t Serializer::unserialize_view(const std::vector<std::byte>& buf) {
  sexp_view_t sexp_view;

  // Nothing can be read from a buffer that cannot even hold the flags
  if(buf.size() < sizeof(int)) {
    return sexp_view;
  }

  // the header is already not there anymore
  const char* data = reinterpret_cast<const char*>(buf.data());

  int flags = 0;
  std::memcpy(&flags, data, sizeof(int));
  data += sizeof(int);

  // The type is stored in the low-order 8 bits of the flags
  sexp_view.type = flags & 255;

  // We handle only vector types
  size_t element_size = 0;
  switch(sexp_view.type) {
    case LGLSXP:
    case INTSXP:
      element_size = sizeof(int);
      break;
    case REALSXP:
      element_size = sizeof(double);
      break;
    case CPLXSXP:
      element_size = sizeof(Rcomplex);
      break;
    case STRSXP:
      // the elements are CHARSXP and have variable size, so element_size stays 0
      // it is easy to find NA though:
      // read the length, if it is -1, it is NA, otherwise, jump length to the next item
      break;
    default:
      return sexp_view;
  }

  // A vector needs at least one more word, for its length
  if(buf.size() < 2 * sizeof(int)) {
    return sexp_view;
  }

  // The payload starts right after the length
  sexp_view.data = read_length(data, sexp_view.length);
  sexp_view.element_size = element_size;

  return sexp_view;
}

void Serializer::append_byte(R_outpstream_t stream, int c) { //add ints, not chars??
WriteBuffer* wbf = static_cast<WriteBuffer*>(stream->data);

Expand Down
12 changes: 12 additions & 0 deletions src/serialization.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
#include <Rinternals.h>
#include <Rdefines.h>

// Non-owning description of a serialized R value: it only points into the
// buffer holding the serialized bytes, so it is only usable while that buffer is.
struct sexp_view_t {
  // R type of the value; ANYSXP until a type has been read
  SEXPTYPE type{ANYSXP};
  // First byte of the vector payload, or nullptr when there is none
  const void* data{nullptr};
  // Number of elements of the vector
  size_t length{0};
  // Size in bytes of one element; 0 when elements do not have a fixed size
  size_t element_size{0};
};

// Not static
// Will make it easier to parallelize (one serializer per thread)
class Serializer {
Expand Down Expand Up @@ -54,6 +61,9 @@ class Serializer {
static std::byte* jump_header(std::vector<std::byte>& buf);





inline static std::array<char, 23> header = {'B', '\n', 3, 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 5, 0, 0, 0, 'U', 'T', 'F', '-', '8'};

public:
Expand All @@ -68,6 +78,8 @@ class Serializer {

// Analyzes a RDS serialization header
static SEXP analyze_header(std::vector<std::byte>& buf);
// Get a view of the data, that does not require allocating
static const sexp_view_t unserialize_view(const std::vector<std::byte>& buf);
};

#endif
8 changes: 8 additions & 0 deletions src/sxpdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,14 @@ SEXP sample_similar(SEXP sxpdb, SEXP vals, SEXP multiple, SEXP relax) {
d.relax_vector();
d.relax_length();
}
else if(relax_param == "keep_class") {
d.relax_attributes();
d.relax_na();
d.relax_ndims();
d.relax_vector();
d.relax_length();
d.relax_type();
}
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/sxpdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ SEXP sample_val(SEXP db);
* @param db external pointer to the database
* @param val a SEXP
* @param multiple boolean, whether val is actual value or a list of values we will do the union of
* @param relax character vector none or several of "na", "length", "attributes", "type", "vector", "ndims", "class".
* @param relax character vector: none or several of "na", "length", "attributes", "type", "vector", "ndims", "class". You can
* also give "keep_type" or "keep_class" to relax all constraints except the type, or except the class names, respectively.
* It will relax the given constraints inferred from the example value.
* @return R value in form of SEXP from the database, R_NilValue if no similar value was found
*/
Expand Down

0 comments on commit 34169d1

Please sign in to comment.