Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

One sided RF distance #76

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions historydag/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -2227,6 +2227,51 @@ def accum_between_clade(dictlist):
).values()
)[0]

def underestimate_rf_diameter(self):
"""Returns an underestimate of the RF diameter of the DAG.
This estimate is calculated by calculating the maximal sum RF distance
between the DAG and a random tree from a topological outlier.

On a set of DAGs with 2000 or less histories, this underestimate
is quite accurate compared to the actual computed RF diameter.
"""
dag_copy = self.copy()
dag_copy.trim_optimal_sum_rf_distance(dag_copy, optimal_func=max)
ref_history = dag_copy.sample()
return self.optimal_rf_distance(ref_history, optimal_func=max)

def overestimate_rf_diameter(self):
"""Returns an overestimate of the RF diameter of the DAG.
This estimate is calculated by calculating twice of the maximal sum RF
distance between the DAG and a random tree from the median tree.

On a set of DAGs with 2000 or less histories, this underestimate was
not close compared to the actual RF diameter. However, the
overestimate was never more than twice of the actual RF diameter.
"""
dag_copy = self.copy()
dag_copy.trim_optimal_sum_rf_distance(dag_copy, optimal_func=min)
ref_history = dag_copy.sample()
return 2 * self.optimal_rf_distance(ref_history, optimal_func=max)

def optimal_one_sided_sum_rf_distance(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be called just optimal_one_sided_rf_distance since we're not taking a sum over anything... unless I'm misunderstanding!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the method returns something like the sum of one-sided RF distances rather than a single distance. I tried running it one some examples, and it returns a number that is bigger than the one returned by optimal_rf_distance.

Also, from skimming the code in utils.one_side_rfdistance_funcs it looks pretty close to utils.sum_rfdistance_fucs

self,
reference_dag: "HistoryDag",
optimal_func: Callable[[List[Weight]], Weight] = min,
):
"""Returns the optimal (min or max) one-sided rooted RF distance to the
reference DAG. In other words, returns the number of clades in the
reference DAG that are not in the given DAG.

The given history must be on the same taxa as all trees in the DAG.
Since computing reference splits is expensive, it is better to use
:meth:``optimal_weight_annotate`` and :meth:``utils.one_sided_rfdistance_funcs``
instead of making multiple calls to this method with the same reference
history DAG.
"""
kwargs = utils.one_sided_rfdistance_funcs(reference_dag)
return self.optimal_weight_annotate(**kwargs, optimal_func=optimal_func)

def optimal_sum_rf_distance(
self,
reference_dag: "HistoryDag",
Expand Down
56 changes: 54 additions & 2 deletions historydag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,8 @@ def sum_rfdistance_funcs(reference_dag: "HistoryDag"):
The reference DAG must have the same taxa as all the trees in the DAG on which these count
functions are used.

The edge weight is computed using the expression 2 * N[c_e] - |T| where c_e is the clade under
the relevant edge, and |T| is the number of trees in the reference dag. This provide rooted RF
The edge weight is computed using the expression |T| - 2 * N[c_e] where c_e is the clade under
the relevant edge, and |T| is the number of trees in the reference dag. This provides rooted RF
distances, meaning that the clade below each edge is used for RF distance computation.

The weights are represented by an IntState object and are shifted by a constant K,
Expand Down Expand Up @@ -682,6 +682,58 @@ def edge_func(n1, n2):
return kwargs


def one_sided_rfdistance_funcs(reference_dag: "HistoryDag"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this looks great! I think the argument reference_dag should be called reference_history, since even though it's a history DAG object, it's expected to be tree-shaped (hence an history).

Also, we should make it very clear in the docstring whether the reference_history is expected to be multifurcating or not, when we're trying to detect resolutions of multifurcating trees.

"""Provides functions to compute the one sided RF distance to a reference tree.
In other words, the number of clades in a tree that are not in the reference tree.

Args:
reference_dag: The reference DAG. The distance will be computed in relation
to this DAG

The reference DAG must have the same taxa as all the trees in the DAG on which these count
functions are used.

The edge weight is computed using the expression |T| - N[c_e] where c_e is the clade under
the relevant edge, and |T| is the number of trees in the reference dag. This provides rooted RF
distances, meaning that the clade below each edge is used for RF distance computation.

The weights are represented by an IntState object.
"""
N = reference_dag.count_nodes(collapse=True)

# Remove the UA node clade union from N
try:
N.pop(frozenset())
except KeyError:
pass

num_trees = reference_dag.count_histories()

def make_intstate(n):
return IntState(n, state=n)

def edge_func(n1, n2):
clade = n2.clade_union()
if clade in N:
weight = num_trees - (1 * N[n2.clade_union()])
else:
# This clade's count should then just be 0:
weight = num_trees
return make_intstate(weight)

kwargs = AddFuncDict(
{
"start_func": lambda n: make_intstate(0),
"edge_weight_func": edge_func,
"accum_func": lambda wlist: make_intstate(
sum(w.state for w in wlist)
), # summation over edge weights
},
name="one_sided_RF_rooted_sum",
)
return kwargs


def make_rfdistance_countfuncs(ref_tree: "HistoryDag", rooted: bool = False):
"""Provides functions to compute Robinson-Foulds (RF) distances of trees in
a DAG, relative to a fixed reference tree.
Expand Down
15 changes: 15 additions & 0 deletions tests/test_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,21 @@ def test_optimal_sum_rf_distance():
assert calculated_sum == expected_sum


def test_one_sided_rf_distance():
for dag in dags:
for tree in dag:
old_tree = tree.copy()
tree.convert_to_collapsed()

# look at trees where the collapsed tree is different from the original
if tree.to_newick() != old_tree.to_newick():
assert (tree.optimal_one_sided_sum_rf_distance(old_tree) == 0)
assert (old_tree.optimal_one_sided_sum_rf_distance(tree) >= 0)
else:
assert (tree.optimal_one_sided_sum_rf_distance(old_tree) == 0)
assert (old_tree.optimal_one_sided_sum_rf_distance(tree) == 0)


def test_trim_range():
for curr_dag in dags + cdags:
history_dag = curr_dag.copy()
Expand Down