matsengrp · clarisw · Jan 31, 2023 · Feb 4, 2023 · Feb 4, 2023 · Apr 22, 2023
diff --git a/historydag/dag.py b/historydag/dag.py
@@ -2227,6 +2227,51 @@ def accum_between_clade(dictlist):
             ).values()
         )[0]
 
+    def underestimate_rf_diameter(self):
+        """Returns an underestimate of the RF diameter of the DAG.
+        This estimate is calculated by calculating the maximal sum RF distance
+        between the DAG and a random tree from a topological outlier.
+
+        On a set of DAGs with 2000 or less histories, this underestimate
+        is quite accurate compared to the actual computed RF diameter.
+        """
+        dag_copy = self.copy()
+        dag_copy.trim_optimal_sum_rf_distance(dag_copy, optimal_func=max)
+        ref_history = dag_copy.sample()
+        return self.optimal_rf_distance(ref_history, optimal_func=max)
+
+    def overestimate_rf_diameter(self):
+        """Returns an overestimate of the RF diameter of the DAG.
+        This estimate is calculated by calculating twice of the maximal sum RF
+        distance between the DAG and a random tree from the median tree.
+
+        On a set of DAGs with 2000 or less histories, this underestimate was
+        not close compared to the actual RF diameter. However, the
+        overestimate was never more than twice of the actual RF diameter.
+        """
+        dag_copy = self.copy()
+        dag_copy.trim_optimal_sum_rf_distance(dag_copy, optimal_func=min)
+        ref_history = dag_copy.sample()
+        return 2 * self.optimal_rf_distance(ref_history, optimal_func=max)
+
+    def optimal_one_sided_sum_rf_distance(
+        self,
+        reference_dag: "HistoryDag",
+        optimal_func: Callable[[List[Weight]], Weight] = min,
+    ):
+        """Returns the optimal (min or max) one-sided rooted RF distance to the
+        reference DAG. In other words, returns the number of clades in the
+        reference DAG that are not in the given DAG.
+
+        The given history must be on the same taxa as all trees in the DAG.
+        Since computing reference splits is expensive, it is better to use
+        :meth:``optimal_weight_annotate`` and :meth:``utils.one_sided_rfdistance_funcs``
+        instead of making multiple calls to this method with the same reference
+        history DAG.
+        """
+        kwargs = utils.one_sided_rfdistance_funcs(reference_dag)
+        return self.optimal_weight_annotate(**kwargs, optimal_func=optimal_func)
+
     def optimal_sum_rf_distance(
         self,
         reference_dag: "HistoryDag",

diff --git a/historydag/utils.py b/historydag/utils.py
@@ -637,8 +637,8 @@ def sum_rfdistance_funcs(reference_dag: "HistoryDag"):
     The reference DAG must have the same taxa as all the trees in the DAG on which these count
     functions are used.
 
-    The edge weight is computed using the expression 2 * N[c_e] - |T| where c_e is the clade under
-    the relevant edge, and |T| is the number of trees in the reference dag. This provide rooted RF
+    The edge weight is computed using the expression |T| - 2 * N[c_e] where c_e is the clade under
+    the relevant edge, and |T| is the number of trees in the reference dag. This provides rooted RF
     distances, meaning that the clade below each edge is used for RF distance computation.
 
     The weights are represented by an IntState object and are shifted by a constant K,
@@ -682,6 +682,58 @@ def edge_func(n1, n2):
     return kwargs
 
 
+def one_sided_rfdistance_funcs(reference_dag: "HistoryDag"):
+    """Provides functions to compute the one sided RF distance to a reference tree.
+    In other words, the number of clades in a tree that are not in the reference tree.
+
+    Args:
+        reference_dag: The reference DAG. The distance will be computed in relation
+            to this DAG
+
+    The reference DAG must have the same taxa as all the trees in the DAG on which these count
+    functions are used.
+
+    The edge weight is computed using the expression |T| - N[c_e] where c_e is the clade under
+    the relevant edge, and |T| is the number of trees in the reference dag. This provides rooted RF
+    distances, meaning that the clade below each edge is used for RF distance computation.
+
+    The weights are represented by an IntState object.
+    """
+    N = reference_dag.count_nodes(collapse=True)
+
+    # Remove the UA node clade union from N
+    try:
+        N.pop(frozenset())
+    except KeyError:
+        pass
+
+    num_trees = reference_dag.count_histories()
+
+    def make_intstate(n):
+        return IntState(n, state=n)
+
+    def edge_func(n1, n2):
+        clade = n2.clade_union()
+        if clade in N:
+            weight = num_trees - (1 * N[n2.clade_union()])
+        else:
+            # This clade's count should then just be 0:
+            weight = num_trees
+        return make_intstate(weight)
+
+    kwargs = AddFuncDict(
+        {
+            "start_func": lambda n: make_intstate(0),
+            "edge_weight_func": edge_func,
+            "accum_func": lambda wlist: make_intstate(
+                sum(w.state for w in wlist)
+            ),  # summation over edge weights
+        },
+        name="one_sided_RF_rooted_sum",
+    )
+    return kwargs
+
+
 def make_rfdistance_countfuncs(ref_tree: "HistoryDag", rooted: bool = False):
     """Provides functions to compute Robinson-Foulds (RF) distances of trees in
     a DAG, relative to a fixed reference tree.

diff --git a/tests/test_factory.py b/tests/test_factory.py
@@ -613,6 +613,21 @@ def test_optimal_sum_rf_distance():
             assert calculated_sum == expected_sum
 
 
+def test_one_sided_rf_distance():
+    for dag in dags:
+        for tree in dag:
+            old_tree = tree.copy()
+            tree.convert_to_collapsed()
+
+            # look at trees where the collapsed tree is different from the original
+            if tree.to_newick() != old_tree.to_newick():
+                assert (tree.optimal_one_sided_sum_rf_distance(old_tree) == 0)
+                assert (old_tree.optimal_one_sided_sum_rf_distance(tree) >= 0)
+            else:
+                assert (tree.optimal_one_sided_sum_rf_distance(old_tree) == 0)
+                assert (old_tree.optimal_one_sided_sum_rf_distance(tree) == 0)
+
+
 def test_trim_range():
     for curr_dag in dags + cdags:
         history_dag = curr_dag.copy()