Skip to content

Commit

Permalink
Merge pull request #1916 from adamnovak/robustify-add
Browse files Browse the repository at this point in the history
Make vg add more robust
  • Loading branch information
adamnovak authored Oct 8, 2018
2 parents 4bdbf54 + c93fad8 commit ea4aade
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 34 deletions.
12 changes: 6 additions & 6 deletions src/graph_synchronizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,12 +301,12 @@ set<NodeSide> GraphSynchronizer::Lock::get_peripheral_attachments(NodeSide graph
}
}

vector<Translation> GraphSynchronizer::Lock::apply_edit(const Path& path) {
vector<Translation> GraphSynchronizer::Lock::apply_edit(const Path& path, size_t max_node_size) {
set<NodeSide> dangling;
return apply_edit(path, dangling);
return apply_edit(path, dangling, max_node_size);
}

vector<Translation> GraphSynchronizer::Lock::apply_edit(const Path& path, set<NodeSide>& dangling) {
vector<Translation> GraphSynchronizer::Lock::apply_edit(const Path& path, set<NodeSide>& dangling, size_t max_node_size) {
// Make sure we have exclusive ownership of the graph itself since we're
// going to be modifying its data structures.
std::lock_guard<std::mutex> guard(synchronizer.whole_graph_lock);
Expand All @@ -320,7 +320,7 @@ vector<Translation> GraphSynchronizer::Lock::apply_edit(const Path& path, set<No
}

// Make all the edits, passing along the dangling node set.
vector<Translation> translations = synchronizer.graph.edit_fast(path, dangling);
vector<Translation> translations = synchronizer.graph.edit_fast(path, dangling, max_node_size);

// Lock all the nodes that result from the translations. They're guaranteed
// to either be nodes we already have or novel nodes with fresh IDs.
Expand All @@ -347,7 +347,7 @@ vector<Translation> GraphSynchronizer::Lock::apply_edit(const Path& path, set<No
return translations;
}

vector<Translation> GraphSynchronizer::Lock::apply_full_length_edit(const Path& path) {
vector<Translation> GraphSynchronizer::Lock::apply_full_length_edit(const Path& path, size_t max_node_size) {
// Find the left and right outer nodesides of the subgraph
auto ends = get_endpoints();

Expand All @@ -357,7 +357,7 @@ vector<Translation> GraphSynchronizer::Lock::apply_full_length_edit(const Path&
// Apply the edit, attaching its left end to the stuff attached to the left
// end of the graph. Get back in the dangling set where the right end of the
// edit's material is.
auto translations = apply_edit(path, dangling);
auto translations = apply_edit(path, dangling, max_node_size);

// Get the places that the right end of the graph attaches to
auto right_periphery = get_peripheral_attachments(ends.second);
Expand Down
6 changes: 3 additions & 3 deletions src/graph_synchronizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,14 @@ class GraphSynchronizer {
* The set will be populated with the NodeSides for the ends of nodes
* created/visited at the end of the alignment.
*/
vector<Translation> apply_edit(const Path& path, set<NodeSide>& dangling);
vector<Translation> apply_edit(const Path& path, set<NodeSide>& dangling, size_t max_node_size = 1024);

/**
* May only be called when locked. Apply a path as an edit to the base
* graph, leaving new nodes at the ends of the path unattached on their
* outer sides.
*/
vector<Translation> apply_edit(const Path& path);
vector<Translation> apply_edit(const Path& path, size_t max_node_size = 1024);

/**
* May only be called when locked. Apply a path as an edit to the base
Expand All @@ -139,7 +139,7 @@ class GraphSynchronizer {
* The alignment must be in the local forward orientation of the graph
* for this to make sense.
*/
vector<Translation> apply_full_length_edit(const Path& path);
vector<Translation> apply_full_length_edit(const Path& path, size_t max_node_size = 1024);

protected:

Expand Down
50 changes: 33 additions & 17 deletions src/unittest/variant_adder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,9 +381,9 @@ TEST_CASE( "The smart aligner should use mapping offsets on huge deletions", "[v

string graph_json = R"({
"node": [
{"id": 1, "sequence": "GCGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"},
{"id": 1, "sequence": "GCGCAAAAAAAAAAAAAAAAAAAA"},
{"id": 2, "sequence": "<10kAs>"},
{"id": 3, "sequence": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGCGC"}],
{"id": 3, "sequence": "AAAAAAAAAAAAAAAAAAAAGCGC"}],
"edge": [
{"from": 1, "to": 2},
{"from": 1, "to": 3},
Expand Down Expand Up @@ -466,9 +466,9 @@ TEST_CASE( "The smart aligner should find existing huge deletions", "[variantadd

string graph_json = R"({
"node": [
{"id": 1, "sequence": "GCGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"},
{"id": 1, "sequence": "GCGCAAAAAAAAAAAAAAAAAAAA"},
{"id": 2, "sequence": "<10kAs>"},
{"id": 3, "sequence": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGCGC"}],
{"id": 3, "sequence": "AAAAAAAAAAAAAAAAAAAAGCGC"}],
"edge": [
{"from": 1, "to": 2},
{"from": 1, "to": 3},
Expand Down Expand Up @@ -581,32 +581,48 @@ TEST_CASE( "The smart aligner should use deletion edits on medium deletions", "[
REQUIRE(aligned.sequence() == deleted);
}

SECTION("the resulting alignment should have one mapping") {
REQUIRE(aligned.path().mapping_size() == 1);
SECTION("the resulting alignment should have one or more mappings") {

auto& m = aligned.path().mapping(0);
REQUIRE(aligned.path().mapping_size() >= 1);

SECTION("that mapping should have 3 edits") {
REQUIRE(m.edit_size() == 3);
auto& mFirst = aligned.path().mapping(0);
auto& mLast = aligned.path().mapping(aligned.path().mapping_size() - 1);

vector<Edit> middles;
for (size_t i = 0; i < aligned.path().mapping_size(); i++) {
for(size_t j = 0; j < aligned.path().mapping(i).edit_size(); j++) {
if ((i != 0 || j != 0) && (i != aligned.path().mapping_size() - 1 || j != aligned.path().mapping(i).edit_size() - 1)) {
// Collect all the non-end mappings
middles.push_back(aligned.path().mapping(i).edit(j));
}
}
}

SECTION("the mappings should ahve leading and trailing match edits and itnernal deletions") {
REQUIRE(mFirst.edit_size() >= 1);
REQUIRE(mLast.edit_size() >= 1);

auto& match1 = m.edit(0);
auto& del = m.edit(1);
auto& match2 = m.edit(2);
auto& match1 = mFirst.edit(0);
auto& match2 = mLast.edit(mLast.edit_size() - 1);

SECTION("the first edit should be a match of the leading ref part") {
REQUIRE(edit_is_match(match1));
}

SECTION("the second edit should be a deletion of the deleted sequence") {
REQUIRE(edit_is_deletion(del));
REQUIRE(del.from_length() == 100 - 21);
SECTION("the middle edits should be a deletion of the deleted sequence") {
size_t total_deleted = 0;
for (auto& del : middles) {
REQUIRE(edit_is_deletion(del));
total_deleted += del.from_length();
}
REQUIRE(total_deleted == 100 - 21);
}

SECTION("the third edit should be a match of the trailing ref part") {
SECTION("the last edit should be a match of the trailing ref part") {
REQUIRE(edit_is_match(match2));
}

SECTION("the first and third edits together should add up to the aligned sequence's length") {
SECTION("the first and last edits together should add up to the aligned sequence's length") {
REQUIRE(match1.from_length() + match2.from_length() == deleted.size());
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/variant_adder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ VariantAdder::VariantAdder(VG& graph) : graph(graph), sync(graph) {

// Make sure to dice nodes to 1024 or smaller, the max size that GCSA2
// supports, in case we need to GCSA-index part of the graph.
graph.dice_nodes(1024);
graph.dice_nodes(max_node_size);

// Configure the aligner to use a full length bonus
aligner.full_length_bonus = 5;
Expand Down Expand Up @@ -348,7 +348,7 @@ void VariantAdder::add_variants(vcflib::VariantCallFile* vcf) {

// Make this path's edits to the original graph. We don't need to do
// anything with the translations.
lock.apply_full_length_edit(aln.path());
lock.apply_full_length_edit(aln.path(), max_node_size);

// Count all the bases in the haplotype
total_haplotype_bases += to_align.str().size();
Expand Down
10 changes: 8 additions & 2 deletions src/variant_adder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ class VariantAdder : public NameMapper, public Progressive {
*/
void align_ns(vg::VG& graph, Alignment& aln);

/// What's the maximum node size we should produce, and the size we should
/// chop the input graph to? Since alt sequences are forced out to node
/// boundaries, it makes sense for this to be small relative to
/// whole_alignment_cutoff.
size_t max_node_size = 32;

/// How wide of a range in bases should we look for nearby variants in?
size_t variant_range = 50;

Expand All @@ -85,11 +91,11 @@ class VariantAdder : public NameMapper, public Progressive {
/// which we can just use permissive banding and large band padding? If
/// either is larger than this, we use the pinned-alignment-based do-each-
/// end-and-splice mode.
size_t whole_alignment_cutoff = 1000;
size_t whole_alignment_cutoff = 4096;

/// When we're above that cutoff, what amount of band padding can we use
/// looking for an existing version of our sequence?
size_t large_alignment_band_padding = 20;
size_t large_alignment_band_padding = 30;

/// When we're doing a restricted band padding alignment, how good does it
/// have to be, as a fraction of the perfect match score for the whole
Expand Down
4 changes: 2 additions & 2 deletions src/vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4757,7 +4757,7 @@ vector<Translation> VG::edit(vector<Path>& paths_to_add, bool save_paths, bool u
}

// The not quite as robust (TODO: how?) but actually efficient way to edit the graph.
vector<Translation> VG::edit_fast(const Path& path, set<NodeSide>& dangling) {
vector<Translation> VG::edit_fast(const Path& path, set<NodeSide>& dangling, size_t max_node_size) {
// Collect the breakpoints
map<id_t, set<pos_t>> breakpoints;

Expand Down Expand Up @@ -4795,7 +4795,7 @@ vector<Translation> VG::edit_fast(const Path& path, set<NodeSide>& dangling) {
// we will record the nodes that we add, so we can correctly make the returned translation for novel insert nodes
map<Node*, Path> added_nodes;
// create new nodes/wire things up.
add_nodes_and_edges(path, node_translation, added_seqs, added_nodes, orig_node_sizes, dangling);
add_nodes_and_edges(path, node_translation, added_seqs, added_nodes, orig_node_sizes, dangling, max_node_size);

// Make the translations (in about the same format as VG::edit(), but
// without a translation for every single node and with just the nodes we
Expand Down
2 changes: 1 addition & 1 deletion src/vg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ class VG : public Progressive, public MutableHandleGraph, public PathHandleGraph
/// edits that hit the end of a node to be attached to what comes
/// before/after the node by the caller, as this function doesn't handle
/// that.
vector<Translation> edit_fast(const Path& path, set<NodeSide>& dangling);
vector<Translation> edit_fast(const Path& path, set<NodeSide>& dangling, size_t max_node_size = 1024);

/// Find all the points at which a Path enters or leaves nodes in the graph. Adds
/// them to the given map by node ID of sets of bases in the node that will need
Expand Down
2 changes: 1 addition & 1 deletion test/t/31_vg_add.t
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ is "$?" "0" "vg add can create a slightly larger graph"

is "$(vg view -c x.vg | jq -c '.path[].mapping[] | select(.rank | not)' | wc -l)" "0" "ranks are calculated for emitted paths"

is "$(vg view -Jv add/backward.json | vg add -v add/benedict.vcf - | vg stats -N -)" "5" "graphs with backward nodes can be added to"
is "$(vg view -Jv add/backward.json | vg add -v add/benedict.vcf - | vg mod --unchop - | vg stats -N -)" "5" "graphs with backward nodes can be added to"

is "$(vg view -Jv add/backward_and_forward.json | vg add -v add/benedict.vcf - | vg mod --unchop - | vg stats -N -)" "5" "graphs with backward and forward nodes can be added to"

Expand Down

1 comment on commit ea4aade

@cgcloud-jenkins
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jenkins vg tests complete for merge to master. View the full report here.

19 tests passed, 0 tests failed and 10 tests skipped in 21651 seconds

Tests produced 2787 warnings. 2786 were for lower-than-expected alignment scores

Please sign in to comment.