From 52dcdf3e483ec4ed4cb6ad1e91ba432c593b4f36 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 Apr 2020 17:09:51 +0200 Subject: [PATCH] trade space for time and store edges in their own hacked vector --- src/node.cpp | 100 ++++++++++++++++++++------------------------------- src/node.hpp | 20 ++++------- src/odgi.cpp | 14 ++++---- 3 files changed, 52 insertions(+), 82 deletions(-) diff --git a/src/node.cpp b/src/node.cpp index dbb40e2e..8ab6ccab 100644 --- a/src/node.cpp +++ b/src/node.cpp @@ -5,55 +5,33 @@ namespace odgi { uint64_t node_t::sequence_size(void) const { - return seq_bytes(); + return sequence.size(); } -const std::string node_t::sequence(void) const { - const std::string res((char*)bytes.data()+seq_start(), seq_bytes()); - return res; +const std::string node_t::get_sequence(void) const { + return sequence; } void node_t::set_sequence(const std::string& seq) { - if (seq.size() > seq_bytes()) { - bytes.reserve(bytes.size()+seq.size()-seq_bytes()); - bytes.insert(bytes.begin()+seq_start(), seq.size() - seq_bytes(), 0); - set_seq_bytes(seq.size()); - } else if (seq.size() < seq_bytes()) { - bytes.erase(bytes.begin()+seq_start(), bytes.begin()+seq_start()+(seq_bytes()-seq.size()));; - set_seq_bytes(seq.size()); - } - memcpy(bytes.data()+seq_start(), seq.c_str(), seq.size()); + sequence = seq; } -std::vector node_t::edges(void) const { - std::vector res; - if (edge_count()) { - res.resize(edge_count()*EDGE_RECORD_LENGTH); - sqvarint::decode(res.data(), - (uint8_t*)bytes.data()+edge_start(), - edge_count()*EDGE_RECORD_LENGTH); - } - return res; +const dyn::hacked_vector& node_t::get_edges(void) const { + return edges; } void node_t::add_edge(const uint64_t& relative_id, const uint64_t& edge_type) { //std::cerr << "add edge " << "relative_id " << relative_id << " edge_type " << edge_type << std::endl; - uint64_t add_edge_bytes = sqvarint::length({relative_id, edge_type}); - bytes.reserve(bytes.size()+add_edge_bytes); - bytes.insert(bytes.begin()+edge_start(), add_edge_bytes, 0); - sqvarint::encode({relative_id, edge_type}, bytes.data()+edge_start()); - set_edge_bytes(edge_bytes() + add_edge_bytes); - set_edge_count(edge_count() + 1); + edges.push_back(relative_id); + edges.push_back(edge_type); } void node_t::remove_edge(const uint64_t& rank) { assert(rank < edge_count()); - uint64_t edge_offset = edge_start() + sqvarint::bytes(bytes.data()+edge_start(), EDGE_RECORD_LENGTH*rank); - // a bit redundant - uint64_t j = sqvarint::bytes(bytes.data()+edge_offset, EDGE_RECORD_LENGTH); - bytes.erase(bytes.begin()+edge_offset, bytes.begin()+edge_offset+j); - set_edge_count(edge_count()-1); - set_edge_bytes(edge_bytes()-j); + uint64_t offset = EDGE_RECORD_LENGTH*rank; + for (uint8_t i = 0; i < EDGE_RECORD_LENGTH; ++i) { + edges.remove(offset); + } } void node_t::add_path_step(const uint64_t& path_id, const bool& is_rev, @@ -139,13 +117,16 @@ void node_t::remove_path_step(const uint64_t& rank) { } void node_t::clear(void) { - set_seq_bytes(0); - set_edge_bytes(0); - set_edge_count(0); - bytes.clear(); + sequence.clear(); + clear_edges(); clear_path_steps(); } +void node_t::clear_edges(void) { + dyn::hacked_vector null_iv; + edges = null_iv; +} + void node_t::clear_path_steps(void) { dyn::hacked_vector null_iv; path_steps = null_iv; @@ -153,42 +134,37 @@ void node_t::clear_path_steps(void) { uint64_t node_t::serialize(std::ostream& out) const { uint64_t written = 0; - out.write((char*)&_seq_bytes, sizeof(uint32_t)); - out.write((char*)&_edge_bytes, sizeof(uint32_t)); - out.write((char*)&_edge_count, sizeof(uint32_t)); - written += sizeof(uint32_t)*4 + sizeof(uint8_t); - uint64_t node_size = bytes.size(); - out.write((char*)&node_size, sizeof(node_size)); - written += sizeof(uint64_t); - out.write((char*)bytes.data(), node_size*sizeof(uint8_t)); - written += sizeof(uint8_t)*node_size; + size_t seq_size = sequence.size(); + out.write((char*)&seq_size, sizeof(size_t)); + written += sizeof(size_t); + out << sequence; + written += sequence.size(); + written += edges.serialize(out); written += path_steps.serialize(out); return written; } void node_t::load(std::istream& in) { - in.read((char*)&_seq_bytes, sizeof(uint32_t)); - in.read((char*)&_edge_bytes, sizeof(uint32_t)); - in.read((char*)&_edge_count, sizeof(uint32_t)); - uint64_t node_size = 0; - in.read((char*)&node_size, sizeof(node_size)); - bytes.resize(node_size); - in.read((char*)bytes.data(), node_size*sizeof(uint8_t)); + size_t seq_size; + in.read((char*)&seq_size, sizeof(size_t)); + sequence.resize(seq_size); + in.read((char*)sequence.c_str(), seq_size); + edges.load(in); path_steps.load(in); } void node_t::display(void) const { - std::cerr << "self_bytes " << bytes.size() << " " - << "seq_bytes " << seq_bytes() << " " - << "seq " << sequence() << " " - << "edge_start " << edge_start() << " " + std::cerr << "seq " << sequence << " " << "edge_count " << edge_count() << " " - << "edge_bytes " << edge_bytes() << " " << "path_count " << path_count() << " | "; - for (auto i : bytes) { - std::cerr << (int) i << " "; + if (edge_count()) { + for (uint64_t i = 0; i < edge_count(); ++i) { + std::cerr + << edges.at(i) << ":" + << edges.at(i+1) << " "; + } } - std::cerr << " | "; + std::cerr << "| "; if (path_count()) { for (uint64_t i = 0; i < path_count(); ++i) { std::cerr diff --git a/src/node.hpp b/src/node.hpp index 967f778c..c4ae8e16 100644 --- a/src/node.hpp +++ b/src/node.hpp @@ -20,21 +20,14 @@ const uint8_t PATH_RECORD_LENGTH = 5; /// A node object with the sequence, its edge lists, and paths class node_t { - std::vector bytes; + std::string sequence; + dyn::hacked_vector edges; dyn::hacked_vector path_steps; - uint32_t _seq_bytes = 0; - uint32_t _edge_bytes = 0; - uint32_t _edge_count = 0; public: inline const uint64_t seq_start(void) const { return 0; } - inline const uint64_t seq_bytes(void) const { return _seq_bytes; } - inline const uint64_t edge_start(void) const { return _seq_bytes; } - inline const uint64_t edge_count(void) const { return _edge_count; } - inline const uint64_t edge_bytes(void) const { return _edge_bytes; } + inline const uint64_t seq_bytes(void) const { return sequence.size(); } + inline const uint64_t edge_count(void) const { return edges.size()/EDGE_RECORD_LENGTH; } inline const uint64_t path_count(void) const { return path_steps.size()/PATH_RECORD_LENGTH; } - inline void set_seq_bytes(const uint64_t& i) { _seq_bytes = i; } - inline void set_edge_count(const uint64_t& i) { _edge_count = i; } - inline void set_edge_bytes(const uint64_t& i) { _edge_bytes = i; } struct step_t { uint64_t data[5] = { 0, 0, 0, 0, 0 }; // PATH_RECORD_LENGTH step_t(void) { } @@ -65,9 +58,9 @@ class node_t { inline void set_next_rank(const uint64_t& i) { data[4] = i; } }; uint64_t sequence_size(void) const; - const std::string sequence(void) const; + const std::string get_sequence(void) const; void set_sequence(const std::string& seq); - std::vector edges(void) const; + const dyn::hacked_vector& get_edges(void) const; void add_edge(const uint64_t& relative_id, const uint64_t& edge_type); void remove_edge(const uint64_t& rank); void add_path_step(const uint64_t& path_id, const bool& is_rev, @@ -86,6 +79,7 @@ class node_t { void remove_path_step(const uint64_t& rank); void update_path_last_bytes(void); void clear(void); + void clear_edges(void); void clear_path_steps(void); uint64_t serialize(std::ostream& out) const; void load(std::istream& in); diff --git a/src/odgi.cpp b/src/odgi.cpp index 92382ede..3ce55bd5 100644 --- a/src/odgi.cpp +++ b/src/odgi.cpp @@ -49,7 +49,7 @@ size_t graph_t::get_length(const handle_t& handle) const { /// Get the sequence of a node, presented in the handle's local forward orientation. std::string graph_t::get_sequence(const handle_t& handle) const { - auto& seq = node_v.at(number_bool_packing::unpack_number(handle)).sequence(); + auto& seq = node_v.at(number_bool_packing::unpack_number(handle)).get_sequence(); return (get_is_reverse(handle) ? reverse_complement(seq) : seq); } @@ -60,7 +60,7 @@ bool graph_t::follow_edges_impl(const handle_t& handle, bool go_left, const std: const node_t& node = node_v.at(number_bool_packing::unpack_number(handle)); bool is_rev = get_is_reverse(handle); nid_t node_id = get_id(handle); - const std::vector node_edges = node.edges(); + auto& node_edges = node.get_edges(); if (node_edges.size() == 0) return true; for (uint64_t i = 0; i < node_edges.size(); i+=2) { // unpack the edge @@ -603,7 +603,7 @@ void graph_t::destroy_edge(const handle_t& left_h, const handle_t& right_h) { nid_t right_node_id = get_id(right_h); nid_t left_node_id = get_id(left_h); - std::vector left_node_edges = left_node.edges(); + auto& left_node_edges = left_node.get_edges(); bool found_edge = false; for (uint64_t i = 0; i < left_node_edges.size(); ) { uint64_t other_id = edge_delta_to_id(left_node_id, left_node_edges.at(i++)); @@ -622,7 +622,7 @@ void graph_t::destroy_edge(const handle_t& left_h, const handle_t& right_h) { } } - std::vector right_node_edges = right_node.edges(); + auto& right_node_edges = right_node.get_edges(); for (uint64_t i = 0; i < right_node_edges.size(); ) { uint64_t other_id = edge_delta_to_id(right_node_id, right_node_edges.at(i++)); uint8_t packed_edge = right_node_edges.at(i++); @@ -1332,8 +1332,8 @@ void graph_t::display(void) const { for (uint64_t i = 0; i < node_v.size(); ++i) { auto& node = node_v.at(i); nid_t node_id = i+1; - std::cerr << node_id << ":" << node.sequence() << " "; - const std::vector node_edges = node.edges(); + std::cerr << node_id << ":" << node.get_sequence() << " "; + auto& node_edges = node.get_edges(); for (uint64_t j = 0; j < node_edges.size(); ++j) { std::cerr << node_edges.at(j) << ","; } @@ -1381,7 +1381,7 @@ void graph_t::to_gfa(std::ostream& out) const { const node_t& node = node_v.at(number_bool_packing::unpack_number(h)); bool is_rev = get_is_reverse(h); nid_t node_id = get_id(h); - const std::vector node_edges = node.edges(); + auto& node_edges = node.get_edges(); for (uint64_t i = 0; i < node_edges.size(); i+=2) { // unpack the edge uint64_t other_id = edge_delta_to_id(node_id, node_edges.at(i));