Skip to content

Commit

Permalink
Optimize SemistaticMap using shallow copy semantics to share data wit…
Browse files Browse the repository at this point in the history
…h the NormalizedComponent. Decreases injection time by ~5%.
  • Loading branch information
poletti-marco committed Nov 9, 2014
1 parent 050d600 commit f928a00
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 190 deletions.
2 changes: 1 addition & 1 deletion include/fruit/impl/data_structures/semistatic_graph.defn.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ inline typename SemistaticGraph<NodeId, Node>::const_node_iterator SemistaticGra

template <typename NodeId, typename Node>
inline typename SemistaticGraph<NodeId, Node>::node_iterator SemistaticGraph<NodeId, Node>::find(NodeId nodeId) {
InternalNodeId* internalNodeIdPtr = nodeIndexMap.find(nodeId);
const InternalNodeId* internalNodeIdPtr = nodeIndexMap.find(nodeId);
if (internalNodeIdPtr == nullptr) {
return node_iterator{nodes.end()};
} else {
Expand Down
12 changes: 10 additions & 2 deletions include/fruit/impl/data_structures/semistatic_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ namespace impl {
// The alignas ensures that a SemistaticGraphInternalNodeId* always has 0 in the low-order bit.
struct alignas(2) alignas(alignof(std::size_t)) SemistaticGraphInternalNodeId {
  // The wrapped numeric identifier.
  std::size_t id;

  // Two internal IDs compare equal exactly when their `id' fields are equal.
  bool operator==(const SemistaticGraphInternalNodeId& other) const {
    return other.id == id;
  }

  // Orders internal IDs by their `id' field.
  bool operator<(const SemistaticGraphInternalNodeId& other) const {
    return other.id > id;
  }
};

/**
Expand Down Expand Up @@ -164,7 +172,7 @@ class SemistaticGraph {
SemistaticGraph(NodeIter first, NodeIter last);

SemistaticGraph(SemistaticGraph&&) = default;
SemistaticGraph(const SemistaticGraph&) = default;
SemistaticGraph(const SemistaticGraph&) = delete;

// Creates a copy of x with the additional nodes in [first, last). The requirements on NodeIter are the same as for the 2-arg
// constructor.
Expand All @@ -174,7 +182,7 @@ class SemistaticGraph {
template <typename NodeIter>
SemistaticGraph(const SemistaticGraph& x, NodeIter first, NodeIter last);

SemistaticGraph& operator=(const SemistaticGraph&) = default;
SemistaticGraph& operator=(const SemistaticGraph&) = delete;
SemistaticGraph& operator=(SemistaticGraph&&) = default;

node_iterator end();
Expand Down
48 changes: 23 additions & 25 deletions include/fruit/impl/data_structures/semistatic_graph.templates.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,34 +151,43 @@ template <typename NodeId, typename Node>
template <typename NodeIter>
SemistaticGraph<NodeId, Node>::SemistaticGraph(const SemistaticGraph& x, NodeIter first, NodeIter last)
// TODO: Do a shallow copy of the index map too.
: nodeIndexMap(x.nodeIndexMap), firstUnusedIndex(x.firstUnusedIndex), nodes(x.nodes) {
: firstUnusedIndex(x.firstUnusedIndex), nodes(x.nodes) {

// TODO: The code below is very similar to the other constructor, extract the common parts in separate functions.

std::size_t num_new_edges = 0;

// Step 1: assign IDs to new nodes, fill `nodeIndexMap' and update `firstUnusedIndex'.
std::unordered_set<NodeId> nodeIds;

// Step 1a: collect all new node IDs.
std::vector<std::pair<NodeId, InternalNodeId>> nodeIds;
for (NodeIter i = first; i != last; ++i) {
++firstUnusedIndex;
nodeIndexMap.insert(i->getId(), InternalNodeId{firstUnusedIndex - 1}, [this](InternalNodeId x, InternalNodeId) {
// There was already an index for this TypeId, we don't need to allocate an index after all.
--firstUnusedIndex;
return x;
});
if (x.nodeIndexMap.find(i->getId()) == nullptr) {
nodeIds.push_back(std::make_pair(i->getId(), InternalNodeId()));
}
if (!i->isTerminal()) {
for (auto j = i->getEdgesBegin(); j != i->getEdgesEnd(); ++j) {
++firstUnusedIndex;
nodeIndexMap.insert(*j, InternalNodeId{firstUnusedIndex - 1}, [this](InternalNodeId x, InternalNodeId) {
// There was already an index for this TypeId, we don't need to allocate an index after all.
--firstUnusedIndex;
return x;
});
if (x.nodeIndexMap.find(*j) == nullptr) {
nodeIds.push_back(std::make_pair(*j, InternalNodeId()));
}
++num_new_edges;
}
}
}

// Step 1b: remove duplicates.
std::sort(nodeIds.begin(), nodeIds.end());
nodeIds.erase(std::unique(nodeIds.begin(), nodeIds.end()), nodeIds.end());

// Step 1c: assign new IDs.
for (auto& p : nodeIds) {
p.second = InternalNodeId{firstUnusedIndex};
++firstUnusedIndex;
}

// Step 1d: actually populate nodeIndexMap.
nodeIndexMap = SemistaticMap<NodeId, InternalNodeId>(x.nodeIndexMap, std::move(nodeIds));

// Step 2: fill `nodes' and `edgesStorage'

// Note that not all of these will be assigned in the loop below.
Expand All @@ -188,17 +197,6 @@ SemistaticGraph<NodeId, Node>::SemistaticGraph(const SemistaticGraph& x, NodeIte
#endif
Node(), 1});

#ifdef FRUIT_EXTRA_DEBUG
{
std::cerr << "SemistaticGraph constructed with the following known types:" << std::endl;
std::size_t i = 0;
for (typename std::unordered_set<NodeId>::iterator itr = nodeIds.begin(); itr != nodeIds.end(); ++i, ++itr) {
nodes[i].key = *itr;
std::cerr << i << ": " << *itr << std::endl;
}
}
#endif

// edgesStorage[0] is unused, that's the reason for the +1
edgesStorage.reserve(num_new_edges + 1);
edgesStorage.resize(1);
Expand Down
35 changes: 22 additions & 13 deletions include/fruit/impl/data_structures/semistatic_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class SemistaticMap {
private:
using Unsigned = std::uintptr_t;
using NumBits = unsigned char;
using value_type = std::pair<Key, Value>;

static const unsigned char beta = 4;

Expand All @@ -62,42 +63,50 @@ class SemistaticMap {
}

HashFunction hash_function;
// Given a key x, the candidate places for x are keys[lookup_table[hash_function.hash(x)]] and the following cells that hash to the same value.
std::vector<Unsigned> lookup_table;
std::vector<std::pair<Key, Value>> values;
// Given a key x, if p=lookup_table[hash_function.hash(x)] the candidate places for x are [p.first, p.second). These pointers
// point to the values[] vector, but it might be either the one of this object or the one of an object that was shallow-copied
// into this one.
std::vector<std::pair<value_type*, value_type*>> lookup_table;
std::vector<value_type> values;

inline Unsigned hash(const Key& key) const {
return hash_function.hash(std::hash<typename std::remove_cv<Key>::type>()(key));
}

// Inserts a range [elemsBegin, elemsEnd) of new (key,value) pairs with hash h. The keys must not exist in the map.
// Before calling this, ensure that the capacity of `values' is sufficient to contain the new values without re-allocating.
void insert(std::size_t h,
typename std::vector<value_type>::const_iterator elemsBegin,
typename std::vector<value_type>::const_iterator elemsEnd);

public:
// Constructs an *invalid* map (as if this map was just moved from).
SemistaticMap() = default;

// Iter must be a forward iterator with value type std::pair<Key, Value>.
// This constructor is *not* defined in semistatic_map.templates.h, but only in semistatic_map.cc.
// All instantiations must provide an extern template declaration and have a matching instantiation in semistatic_map.cc.
template <typename Iter>
SemistaticMap(Iter begin, std::size_t num_values);

SemistaticMap(const SemistaticMap&) = default;
// Creates a shallow copy of `map' with the additional elements in newElements.
// The keys in newElements must be unique and must not be present in `map'.
// The new map will share data with `map', so must be destroyed before `map' is destroyed.
// NOTE: If more than O(1) elements are added, calls to at() and find() on the result will *not* be O(1).
// This is O(newElements.size()*log(newElements.size())).
SemistaticMap(const SemistaticMap<Key, Value>& map, std::vector<value_type>&& newElements);

SemistaticMap(SemistaticMap&&) = default;
SemistaticMap(const SemistaticMap&) = delete;

SemistaticMap& operator=(const SemistaticMap&) = default;
SemistaticMap& operator=(SemistaticMap&&) = default;
SemistaticMap& operator=(const SemistaticMap&) = delete;

// Precondition: `key' must exist in the map.
// Unlike std::map::at(), this yields undefined behavior if the precondition isn't satisfied (instead of throwing).
Value& at(Key key);
const Value& at(Key key) const;

// Prefer using at() when possible, this is slightly slower.
// Returns nullptr if the key was not found.
const Value* find(Key key) const;
Value* find(Key key);

// Inserts (key, value). If `key' already exists, inserts (key, combine(oldValue, (*this)[key])) instead.
template <typename Combine>
void insert(Key key, Value value, Combine combine);
};

} // namespace impl
Expand Down
132 changes: 73 additions & 59 deletions include/fruit/impl/data_structures/semistatic_map.templates.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,43 +33,67 @@ namespace fruit {
namespace impl {

template <typename Key, typename Value>
template <typename Combine>
void SemistaticMap<Key, Value>::insert(Key key, Value value, Combine combine) {
Unsigned h = hash(key);
Unsigned old_keys_size = values.size();
Unsigned first_candidate_index = lookup_table[h];
Unsigned last_candidate_index = old_keys_size;

{
Unsigned i = first_candidate_index;
for (; i != last_candidate_index; ++i) {
if (values[i].first == key) {
values[i].second = combine(values[i].second, value);
return;
}
Unsigned h1 = hash(values[i].first);
if (h1 != h) {
break;
}
}
last_candidate_index = i;
// Now [first_candidate_index, last_candidate_index) contains only keys that hash to h.
}
void SemistaticMap<Key, Value>::insert(std::size_t h,
typename std::vector<value_type>::const_iterator elemsBegin,
typename std::vector<value_type>::const_iterator elemsEnd) {

value_type* oldBucketBegin = lookup_table[h].first;
value_type* oldBucketEnd = lookup_table[h].second;

// `key' is not in `keys'.
lookup_table[h].first = values.data() + values.size();

// Step 1: re-insert all keys with the same hash at the end (if any).
for (Unsigned i = first_candidate_index; i != last_candidate_index; ++i) {
// The copies make sure that the references passed to push_back don't get invalidated by resizing.
values.emplace_back(Key(values[i].first), Value(values[i].second));
for (value_type* p = oldBucketBegin; p != oldBucketEnd; ++p) {
values.push_back(*p);
}

// Step 2: also insert the new keys and values
for (typename std::vector<value_type>::const_iterator itr = elemsBegin; itr != elemsEnd; ++itr) {
values.push_back(*itr);
}

// Step 2: also insert the new key and value
values.emplace_back(key, value);
lookup_table[h].second = values.data() + values.size();

// Step 3: update the index in the lookup table to point to the newly-added sequence.
// The old sequence is no longer pointed to by any index in the lookup table, but recompacting the vectors would be too slow.
lookup_table[h] = old_keys_size;
}

// Builds a shallow copy of `map' extended with the (key, value) pairs in `newElements'.
//
// The hash function and the lookup table are copied from `map'; the lookup table entries are pointer pairs, so buckets
// that receive no new element keep pointing at the storage referenced by `map'. Each bucket that does receive new
// elements is rebuilt inside this object's own `values' vector by insert(), which first copies the old bucket contents
// and then appends the new elements.
//
// `newElements' is sorted in place (by hash) as part of the construction.
template <typename Key, typename Value>
SemistaticMap<Key, Value>::SemistaticMap(const SemistaticMap<Key, Value>& map, std::vector<value_type>&& newElements)
  : hash_function(map.hash_function), lookup_table(map.lookup_table) {

  // Sort by hash, so that all the new elements belonging to the same bucket are adjacent.
  std::sort(newElements.begin(), newElements.end(), [this](const value_type& x, const value_type& y) {
    return hash(x.first) < hash(y.first);
  });

  // Pass 1: compute how many cells `values' needs: one per new element, plus room for a copy of every old bucket that
  // gets at least one new element (each such bucket is counted once, thanks to the sorting above).
  std::size_t additionalValues = newElements.size();
  for (typename std::vector<value_type>::iterator itr = newElements.begin(), itr_end = newElements.end();
       itr != itr_end;
       /* no increment */) {
    Unsigned h = hash(itr->first);
    auto p = map.lookup_table[h];
    additionalValues += (p.second - p.first);
    // Skip the remaining new elements with the same hash.
    for (; itr != itr_end && hash(itr->first) == h; ++itr) {
    }
  }

  // insert() stores pointers into `values' in the lookup table, so the vector must never re-allocate after this point.
  values.reserve(additionalValues);

  // Pass 2: actually perform the insertions, one call to insert() per run of new elements with the same hash.
  for (typename std::vector<value_type>::iterator itr = newElements.begin(), itr_end = newElements.end();
       itr != itr_end;
       /* no increment */) {
    Unsigned h = hash(itr->first);
    typename std::vector<value_type>::iterator first = itr;
    for (; itr != itr_end && hash(itr->first) == h; ++itr) {
    }
    typename std::vector<value_type>::iterator last = itr;
    insert(h, first, last);
  }
}

template <typename Key, typename Value>
Expand Down Expand Up @@ -104,54 +128,44 @@ SemistaticMap<Key, Value>::SemistaticMap(Iter valuesBegin, std::size_t num_value
pick_another:;
}

std::partial_sum(count.begin(), count.end(), count.begin());
lookup_table = std::move(count);
values.resize(num_values);

std::partial_sum(count.begin(), count.end(), count.begin());
lookup_table.reserve(count.size());
for (Unsigned n : count) {
lookup_table.push_back(make_pair(values.data() + n, values.data() + n));
}

// At this point both pointers in lookup_table[h] point to values.data() + (the number of keys in [first, last) that have a hash <=h).
// Note that even though we ensure this after construction, it is not maintained by insert() so it's not an invariant.

Iter itr = valuesBegin;
for (std::size_t i = 0; i < num_values; ++i, ++itr) {
Unsigned& cell = lookup_table[hash((*itr).first)];
--cell;
assert(cell < num_values);
values[cell] = *itr;
value_type*& firstValuePtr = lookup_table[hash((*itr).first)].first;
--firstValuePtr;
assert(values.data() <= firstValuePtr);
assert(firstValuePtr < values.data() + values.size());
*firstValuePtr = *itr;
}
}

template <typename Key, typename Value>
Value& SemistaticMap<Key, Value>::at(Key key) {
const Value& SemistaticMap<Key, Value>::at(Key key) const {
Unsigned h = hash(key);
Unsigned i = lookup_table[h];
while (true) {
assert(i < values.size());
if (values[i].first == key) {
return values[i].second;
for (const value_type* p = lookup_table[h].first; /* p!=lookup_table[h].second but no need to check */; ++p) {
assert(p != lookup_table[h].second);
if (p->first == key) {
return p->second;
}
assert(hash(values[i].first) == h);
++i;
}
}

// Non-const overload of find(): forwards to the const overload and strips the constness from the resulting pointer.
// Yields whatever the const overload yields for `key'.
template <typename Key, typename Value>
Value* SemistaticMap<Key, Value>::find(Key key) {
  const SemistaticMap<Key, Value>& self = *this;
  const Value* found = self.find(key);
  return const_cast<Value*>(found);
}

template <typename Key, typename Value>
const Value* SemistaticMap<Key, Value>::find(Key key) const {
Unsigned h = hash(key);
Unsigned first_candidate_index = lookup_table[h];
Unsigned last_candidate_index = values.size();
for (Unsigned i = first_candidate_index; i != last_candidate_index; ++i) {
if (values[i].first == key) {
return &(values[i].second);
}
Unsigned h1 = hash(values[i].first);
if (h1 != h) {
break;
for (const value_type *p = lookup_table[h].first, *p_end = lookup_table[h].second; p != p_end; ++p) {
if (p->first == key) {
return &(p->second);
}
}
return nullptr;
Expand Down
Loading

0 comments on commit f928a00

Please sign in to comment.