From 000b005202431108aea837fa8eb730c9c8adfad6 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Sat, 27 Apr 2024 16:13:03 -0700
Subject: [PATCH 01/15] Change the dependency relation of list items to
 discourse instead of nummod, as described in
 https://github.com/UniversalDependencies/UD_English-EWT/issues/518

---
 .../UniversalEnglishGrammaticalRelations.java  | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
index 67f1703384..2579127dee 100644
--- a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
+++ b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
@@ -881,11 +881,6 @@ private UniversalEnglishGrammaticalRelations() {}
    * the meaning of the NP.  Also, the enumeration of lists have
    * this relation to the head of the list item.
    * <br>
-   * Also, the enumeration of lists have this relation to the head of
-   * the list item.  For that, we allow the list of constituents which
-   * have a list under them in any of the training data, as the parser
-   * will likely not produce anything else anyway.
-   * <br>
    * PTB: PP NP X S FRAG <br>
    * EWT: SQ SBARQ SINV SBAR NML VP <br>
    * Craft: PRN <br>
@@ -905,9 +900,7 @@ private UniversalEnglishGrammaticalRelations() {}
             // Note that the earlier tregexes are usually enough to cover those phrases, such as when
             // the QP is by itself in an ADJP or NP, but sometimes it can have other siblings such
             // as in the phrase "$ 100 million or more".  In that case, this next expression is needed.
-            "QP < QP=target < /^[$]$/",
-            // Lists are treated as nummod in UD_English-EWT
-            "PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target");
+            "QP < QP=target < /^[$]$/");
 
 
   /**
@@ -1019,12 +1012,19 @@ private UniversalEnglishGrammaticalRelations() {}
    * define this to include: interjections (oh, uh-huh, Welcome), fillers (um, ah),
    * and discourse markers (well, like, actually, but not: you know).
    * We also use it for emoticons.
+   * <br>
+   * Also, the enumeration of lists have this relation to the head of
+   * the list item.  For that, we allow the list of constituents which
+   * have a list under them in any of the training data, as the parser
+   * will likely not produce anything else anyway.
    */
    public static final GrammaticalRelation DISCOURSE_ELEMENT =
     new GrammaticalRelation(Language.UniversalEnglish, "discourse", "discourse element",
         MODIFIER, ".*", tregexCompiler,
             "__ < (NFP=target [ < " + WESTERN_SMILEY + " | < " + ASIAN_SMILEY + " ] )",
-            "__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]");
+            "__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]",
+            // Lists are treated as discourse in UD_English-EWT as of 2.14
+            "PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target");
 
 
   /**

From d7e703c1a884c92ae315aa0e362e0ea263931bea Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Sun, 28 Apr 2024 01:17:15 -0700
Subject: [PATCH 02/15] UPOS for LS can be NUM, not X - for example, first, 1),
 a)

---
 src/edu/stanford/nlp/trees/UniversalPOSMapper.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
index 6a5905a0c5..dfc394dfd1 100644
--- a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
+++ b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
@@ -165,7 +165,7 @@ public static void load() {
       {"EX", "PRON"},
       {"FW", "X"},
       {"/^JJ.*$/", "ADJ"},
-      {"LS", "X"},
+      {"LS", "NUM"},
       {"MD", "AUX"},
       {"NNS", "NOUN"},
       {"NNP", "PROPN"},

From b2048f6b831060ef0675f83091b568e5522e2a8d Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Mon, 29 Apr 2024 08:06:58 -0700
Subject: [PATCH 03/15] Remove the QP in a structure such as '(NP (QP About a)
 day)' so that the resulting dependencies both connect to day instead of from
 about -> a, changing the UD nummod to a det

---
 src/edu/stanford/nlp/trees/QPTreeTransformer.java | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/edu/stanford/nlp/trees/QPTreeTransformer.java b/src/edu/stanford/nlp/trees/QPTreeTransformer.java
index a727b449eb..67b750de3a 100644
--- a/src/edu/stanford/nlp/trees/QPTreeTransformer.java
+++ b/src/edu/stanford/nlp/trees/QPTreeTransformer.java
@@ -100,6 +100,16 @@ public Tree transformTree(Tree t) {
   private static final TsurgeonPattern splitMoneyTsurgeon =
     Tsurgeon.parseOperation("createSubtree QP left right");
 
+  // Remove QP in a structure such as
+  //   (NP (QP nearly_RB all_DT) stuff_NN)
+  // so that the converter can attach both `nearly` and `all` to `stuff`
+  // not using a nummod, either, which is kind of annoying
+  private static final TregexPattern flattenAdvmodTregex =
+    TregexPattern.compile("NP < (QP=remove <1 RB <2 (DT !$+ __) $++ /^N/)");
+
+  private static final TsurgeonPattern flattenAdvmodTsurgeon =
+    Tsurgeon.parseOperation("excise remove remove");
+
   /**
    * Transforms t if it contains one of the following QP structure:
    * <ul>
@@ -121,6 +131,7 @@ public Tree QPtransform(Tree t) {
     }
     t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
     t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
+    t = Tsurgeon.processPattern(flattenAdvmodTregex, flattenAdvmodTsurgeon, t);
     return t;
   }
 

From 716841bb67f480e55f75829306269771dde979c7 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Mon, 29 Apr 2024 09:28:00 -0700
Subject: [PATCH 04/15] Useful debugging output from CoordinationTransformer. 
 Could think about changing TreeGraphNode to print out the whole tree, but
 that would presumably mess up some various dependency outputs

---
 .../nlp/trees/CoordinationTransformer.java    | 40 +++++++++++--------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/edu/stanford/nlp/trees/CoordinationTransformer.java b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
index 50c2366140..7af8137f0c 100644
--- a/src/edu/stanford/nlp/trees/CoordinationTransformer.java
+++ b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
@@ -71,6 +71,14 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation)
     qp = new QPTreeTransformer(performMWETransformation);
   }
 
+  public void debugLine(String prefix, Tree t) {
+    if (t instanceof TreeGraphNode) {
+      log.info(prefix + ((TreeGraphNode) t).toOneLineString());
+    } else {
+      log.info(prefix + t);
+    }
+  }
+
   /**
    * Transforms t if it contains a coordination in a flat structure (CCtransform)
    * and transforms UCP (UCPtransform).
@@ -81,19 +89,19 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation)
   @Override
   public Tree transformTree(Tree t) {
     if (VERBOSE) {
-      log.info("Input to CoordinationTransformer: " + t);
+      debugLine("Input to CoordinationTransformer: ", t);
     }
 
     if (performMWETransformation) {
       t = gappingTransform(t);
       if (VERBOSE) {
-        log.info("After       t = gappingTransform(t);\n:  " + t);
+        debugLine("After t = gappingTransform(t);:   ", t);
       }
     }
 
     t = tn.transformTree(t);
     if (VERBOSE) {
-      log.info("After DependencyTreeTransformer:  " + t);
+      debugLine("After DependencyTreeTransformer:  ", t);
     }
     if (t == null) {
       return t;
@@ -102,59 +110,59 @@ public Tree transformTree(Tree t) {
     if (performMWETransformation) {
       t = MWETransform(t);
       if (VERBOSE) {
-        log.info("After MWETransform:               " + t);
+        debugLine("After MWETransform:               ", t);
       }
 
       t = MWFlatTransform(t);
       if (VERBOSE) {
-        log.info("After MWFlatTransform:            " + t);
+        debugLine("After MWFlatTransform:            ", t);
       }
 
       t = prepCCTransform(t);
       if (VERBOSE) {
-        log.info("After prepCCTransform:               " + t);
+        debugLine("After prepCCTransform:            ", t);
       }
     }
 
     t = UCPtransform(t);
     if (VERBOSE) {
-      log.info("After UCPTransformer:             " + t);
+      debugLine("After UCPTransformer:             ", t);
     }
     t = CCtransform(t);
     if (VERBOSE) {
-      log.info("After CCTransformer:              " + t);
+      debugLine("After CCTransformer:              ", t);
     }
     t = qp.transformTree(t);
     if (VERBOSE) {
-      log.info("After QPTreeTransformer:          " + t);
+      debugLine("After QPTreeTransformer:          ", t);
     }
     t = SQflatten(t);
     if (VERBOSE) {
-      log.info("After SQ flattening:              " + t);
+      debugLine("After SQ flattening:              ", t);
     }
     t = dates.transformTree(t);
     if (VERBOSE) {
-      log.info("After DateTreeTransformer:        " + t);
+      debugLine("After DateTreeTransformer:        ", t);
     }
     t = removeXOverX(t);
     if (VERBOSE) {
-      log.info("After removeXoverX:               " + t);
+      debugLine("After removeXoverX:               ", t);
     }
     t = combineConjp(t);
     if (VERBOSE) {
-      log.info("After combineConjp:               " + t);
+      debugLine("After combineConjp:               ", t);
     }
     t = moveRB(t);
     if (VERBOSE) {
-      log.info("After moveRB:                     " + t);
+      debugLine("After moveRB:                     ", t);
     }
     t = changeSbarToPP(t);
     if (VERBOSE) {
-      log.info("After changeSbarToPP:             " + t);
+      debugLine("After changeSbarToPP:             ", t);
     }
     t = rearrangeNowThat(t);
     if (VERBOSE) {
-      log.info("After rearrangeNowThat:           " + t);
+      debugLine("After rearrangeNowThat:           ", t);
     }
 
     return t;

From a1d8326a8ecc43bd3df635bc9c1ff692f325e1d8 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Mon, 29 Apr 2024 09:29:41 -0700
Subject: [PATCH 05/15] Also flatten combined RB or ADVP phrases

---
 .../stanford/nlp/trees/QPTreeTransformer.java    | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/edu/stanford/nlp/trees/QPTreeTransformer.java b/src/edu/stanford/nlp/trees/QPTreeTransformer.java
index 67b750de3a..4885e19d4b 100644
--- a/src/edu/stanford/nlp/trees/QPTreeTransformer.java
+++ b/src/edu/stanford/nlp/trees/QPTreeTransformer.java
@@ -100,12 +100,25 @@ public Tree transformTree(Tree t) {
   private static final TsurgeonPattern splitMoneyTsurgeon =
     Tsurgeon.parseOperation("createSubtree QP left right");
 
+  // This fixes a very rare subset of parses
+  // such as "(NP (QP just about all) the losses) ..."
+  // in fact, that's the only example in ptb3-revised
+  // because of previous MWE combinations, we may already get
+  //     "(NP (QP at least a) day)"
+  //  -> "(NP (QP (ADVP at least) a) day)"
+  // and therefore the flattenAdvmodTsurgeon will also find that parse
+  private static final TregexPattern groupADVPTregex =
+    TregexPattern.compile("NP < (QP <1 RB=first <2 RB=second <3 (DT !$+ __) $++ /^N/)");
+
+  private static final TsurgeonPattern groupADVPTsurgeon =
+    Tsurgeon.parseOperation("createSubtree ADVP first second");
+
   // Remove QP in a structure such as
   //   (NP (QP nearly_RB all_DT) stuff_NN)
   // so that the converter can attach both `nearly` and `all` to `stuff`
   // not using a nummod, either, which is kind of annoying
   private static final TregexPattern flattenAdvmodTregex =
-    TregexPattern.compile("NP < (QP=remove <1 RB <2 (DT !$+ __) $++ /^N/)");
+    TregexPattern.compile("NP < (QP=remove <1 ADVP|RB <2 (DT !$+ __) $++ /^N/)");
 
   private static final TsurgeonPattern flattenAdvmodTsurgeon =
     Tsurgeon.parseOperation("excise remove remove");
@@ -131,6 +144,7 @@ public Tree QPtransform(Tree t) {
     }
     t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
     t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
+    t = Tsurgeon.processPattern(groupADVPTregex, groupADVPTsurgeon, t);
     t = Tsurgeon.processPattern(flattenAdvmodTregex, flattenAdvmodTsurgeon, t);
     return t;
   }

From 39f29af7da527962671a0ec921316942e4ad52eb Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Mon, 29 Apr 2024 18:53:04 -0700
Subject: [PATCH 06/15] Update a couple trees to have adverb types instead of
 ADP.  Need to make sure the XPOS tags get updated in the converter when using
 the PTB corrector

---
 .../nlp/trees/treebank/EnglishPTBTreebankCorrector.java  | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
index 2d93578703..eba377a72a 100644
--- a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
+++ b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
@@ -1166,6 +1166,15 @@ private static BufferedReader getBufferedReader(String source) {
 
     "") +
 
+    // for structures such as "over a year", "about a decade", etc
+    ("NP < (QP <1 IN=bad <2 (DT !$+ __) $+ /^N/)\n" +
+     "relabel bad RB\n" +
+     "\n") +
+
+    // for structures such as "just over a decade"
+    ("NP < (QP <1 (RB < just) <2 IN=bad <3 (DT !$+ __) $+ /^N/)\n" +
+     "relabel bad RB\n" +
+     "\n") +
 
     ("@QP < (IN|JJ|RBR|RP=bad < about)\n" +
     "relabel bad RB\n" +

From 8fc2313698c0ce3624170620d1827a10152458f3 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Mon, 29 Apr 2024 19:31:03 -0700
Subject: [PATCH 07/15] If the Corrector is used, use its xpos tags as well
 when building the conll

---
 .../nlp/trees/ud/UniversalDependenciesConverter.java        | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java
index 337d8b5cf2..539c5dd4c2 100644
--- a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java
+++ b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java
@@ -288,6 +288,12 @@ public static void main(String[] args) {
         Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree();
         if (ptbCorrector != null) {
           tree = ptbCorrector.transformTree(tree);
+          List<Label> xposLabels = tree.preTerminalYield();
+          for (IndexedWord token: sg.vertexListSorted()) {
+            int idx = token.index() - 1;
+            String xposTag = xposLabels.get(idx).value();
+            token.set(CoreAnnotations.PartOfSpeechAnnotation.class, xposTag);
+          }
         }
         Tree uposTree = UniversalPOSMapper.mapTree(tree);
         List<Label> uposLabels = uposTree.preTerminalYield();

From 05f32a3c69b55dfe3a2441017876035982761a55 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Mon, 29 Apr 2024 23:02:33 -0700
Subject: [PATCH 08/15] Add an array-like constructor for the
 CompositeTreeTransformer

---
 src/edu/stanford/nlp/trees/CompositeTreeTransformer.java | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/edu/stanford/nlp/trees/CompositeTreeTransformer.java b/src/edu/stanford/nlp/trees/CompositeTreeTransformer.java
index 583fdca730..6a7b4dc219 100644
--- a/src/edu/stanford/nlp/trees/CompositeTreeTransformer.java
+++ b/src/edu/stanford/nlp/trees/CompositeTreeTransformer.java
@@ -2,6 +2,7 @@
 
 import java.util.List;
 import java.util.ArrayList;
+import java.util.Arrays;
 
 /**
  * A TreeTransformer that applies component TreeTransformers in order.
@@ -21,6 +22,10 @@ public CompositeTreeTransformer(List<TreeTransformer> tt) {
     transformers.addAll(tt);
   }
 
+  public CompositeTreeTransformer(TreeTransformer ... tt) {
+    transformers.addAll(Arrays.asList(tt));
+  }
+
   public void addTransformer(TreeTransformer tt) {
     transformers.add(tt);
   }

From bc43e10d171f3ed512def098fd783e601034c4ec Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Tue, 30 Apr 2024 19:09:14 -0700
Subject: [PATCH 09/15] Move the corrector earlier in the UDConverter process. 
 Uses the corrected trees for the structure of the UD graphs, not just the
 tags.  Noticeably reduces the number of validator errors

---
 .../ud/UniversalDependenciesConverter.java    | 45 +++++++++++++------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java
index 539c5dd4c2..8dbd581a19 100644
--- a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java
+++ b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java
@@ -90,8 +90,11 @@ private static class TreeToSemanticGraphIterator implements Iterator<Pair<Semant
     private Iterator<Tree> treeIterator;
     private Tree currentTree; // = null;
 
-    public TreeToSemanticGraphIterator(Iterator<Tree> treeIterator) {
+    private TreeTransformer corrector; // = null;
+
+    public TreeToSemanticGraphIterator(Iterator<Tree> treeIterator, TreeTransformer corrector) {
       this.treeIterator = treeIterator;
+      this.corrector = corrector;
     }
 
     @Override
@@ -102,6 +105,25 @@ public boolean hasNext() {
     @Override
     public Pair<SemanticGraph, SemanticGraph> next() {
       Tree t = treeIterator.next();
+      if (corrector != null) {
+        t = corrector.transformTree(t);
+        // The corrector uses tsurgeon, with two limitations:
+        //   - adjoin nodes don't set word(), just set value()
+        //   - rearranging tags doesn't update the tag() of a leaf
+        List<Tree> preterminals = Trees.preTerminals(t);
+        for (Tree preterminal : preterminals) {
+          assert preterminal.children().length == 1;
+          Tree leaf = preterminal.children()[0];
+          if (!(leaf.label() instanceof CoreLabel)) {
+            throw new RuntimeException("These should all be CoreLabels!");
+          }
+          CoreLabel leafWord = (CoreLabel) leaf.label();
+          if (leafWord.word() == null && leafWord.value() != null) {
+            leafWord.setWord(leafWord.value());
+          }
+          leafWord.setTag(preterminal.value());
+        }
+      }
       currentTree = t;
       return new Pair<>(convertTreeToBasic(t), null);
     }
@@ -246,10 +268,15 @@ public static void main(String[] args) {
     Iterator<Pair<SemanticGraph, SemanticGraph>> sgIterator; // = null;
 
     if (treeFileName != null) {
-      MemoryTreebank tb = new MemoryTreebank(new NPTmpRetainingTreeNormalizer(0, false, 1, false, true));
+      NPTmpRetainingTreeNormalizer normalizer = new NPTmpRetainingTreeNormalizer(0, false, 1, false, true);
+      MemoryTreebank tb = new MemoryTreebank(normalizer);
       tb.loadPath(treeFileName);
       Iterator<Tree> treeIterator = tb.iterator();
-      sgIterator = new TreeToSemanticGraphIterator(treeIterator);
+      TreeTransformer ptbCorrector = null;
+      if (correctPTB) {
+        ptbCorrector = new CompositeTreeTransformer(new EnglishPTBTreebankCorrector(), normalizer);
+      }
+      sgIterator = new TreeToSemanticGraphIterator(treeIterator, ptbCorrector);
     } else if (conlluFileName != null) {
       CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
       try {
@@ -274,27 +301,17 @@ public static void main(String[] args) {
 
     UniversalDependenciesFeatureAnnotator featureAnnotator = (addFeatures) ? new UniversalDependenciesFeatureAnnotator() : null;
     EnglishMWTCombiner mwtCombiner = (combineMWTs) ? new EnglishMWTCombiner() : null;
-    EnglishPTBTreebankCorrector ptbCorrector = (correctPTB) ? new EnglishPTBTreebankCorrector() : null;
 
     CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter();
 
     int graphIdx = 0;
     while (sgIterator.hasNext()) {
-      Pair<SemanticGraph, SemanticGraph> sgs = sgIterator.next();
+      final Pair<SemanticGraph, SemanticGraph> sgs = sgIterator.next();
       SemanticGraph sg = sgs.first();
 
       if (treeFileName != null) {
         //add UPOS tags
         Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree();
-        if (ptbCorrector != null) {
-          tree = ptbCorrector.transformTree(tree);
-          List<Label> xposLabels = tree.preTerminalYield();
-          for (IndexedWord token: sg.vertexListSorted()) {
-            int idx = token.index() - 1;
-            String xposTag = xposLabels.get(idx).value();
-            token.set(CoreAnnotations.PartOfSpeechAnnotation.class, xposTag);
-          }
-        }
         Tree uposTree = UniversalPOSMapper.mapTree(tree);
         List<Label> uposLabels = uposTree.preTerminalYield();
         for (IndexedWord token: sg.vertexListSorted()) {

From c32dada53c1a7a181f8fbf80231780be5bdb5d33 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Sat, 18 May 2024 11:35:01 -0700
Subject: [PATCH 10/15] Alphabetize the regex for English say patterns

---
 src/edu/stanford/nlp/trees/EnglishPatterns.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/edu/stanford/nlp/trees/EnglishPatterns.java b/src/edu/stanford/nlp/trees/EnglishPatterns.java
index 2bba7be5ef..70bd055cc7 100644
--- a/src/edu/stanford/nlp/trees/EnglishPatterns.java
+++ b/src/edu/stanford/nlp/trees/EnglishPatterns.java
@@ -88,7 +88,7 @@ public class EnglishPatterns {
    *  which is a direct speech ccomp. For example: "He concedes: ``This is a difficult market.''"
    */
   public static final String sayVerbRegex =
-    "/^(?i:say|says|said|saying|(?:add|boast|counsel|explain|inform|interject|recall|remark|respond|proclaim|report|claim|shout|whisper|yell)(?:s|ed|ing)?|(?:advis|announc|acknowledg|conced|conclud|decid|declar|observ|stat|not|inton)(?:e|es|ed|ing)|(?:confess)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|reply|replied|replies|replying|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|think|thinks|thinking|thought)$/";
+    "/^(?i:say|says|said|saying|(?:add|boast|claim|counsel|explain|inform|interject|proclaim|recall|remark|report|respond|shout|whisper|yell)(?:s|ed|ing)?|(?:advis|announc|acknowledg|conced|conclud|decid|declar|inton|not|observ|stat)(?:e|es|ed|ing)|(?:confess)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|reply|replied|replies|replying|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|think|thinks|thinking|thought)$/";
 
 
   // TODO: is there some better pattern to look for? We do not have tag information at this point

From b35af47278be4f02bfd3aa83c8f55dc4af350414 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Sat, 18 May 2024 12:23:51 -0700
Subject: [PATCH 11/15] Add a bunch of words not previously included in the say
 regex

"gripes" and "complains" both show up in PTB when checking the results; most of the other verbs added do not.
---
 src/edu/stanford/nlp/trees/EnglishPatterns.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/edu/stanford/nlp/trees/EnglishPatterns.java b/src/edu/stanford/nlp/trees/EnglishPatterns.java
index 70bd055cc7..3487f98cb8 100644
--- a/src/edu/stanford/nlp/trees/EnglishPatterns.java
+++ b/src/edu/stanford/nlp/trees/EnglishPatterns.java
@@ -86,9 +86,11 @@ public class EnglishPatterns {
 
   /** A list of verbs which are verbs of speaking that easily take an S (as a complement or topicalized)
    *  which is a direct speech ccomp. For example: "He concedes: ``This is a difficult market.''"
+   * <br>
+   * TODO: maybe sign, as in ASL?  sing ... wish?
    */
   public static final String sayVerbRegex =
-    "/^(?i:say|says|said|saying|(?:add|boast|claim|counsel|explain|inform|interject|proclaim|recall|remark|report|respond|shout|whisper|yell)(?:s|ed|ing)?|(?:advis|announc|acknowledg|conced|conclud|decid|declar|inton|not|observ|stat)(?:e|es|ed|ing)|(?:confess)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|reply|replied|replies|replying|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|think|thinks|thinking|thought)$/";
+    "/^(?i:say|says|said|saying|(?:add|bellow|bleat|blubber|bluster|boast|boom|bray|call|chant|chirp|claim|complain|coo|counsel|croak|crow|drawl|explain|gasp|inform|interject|pray|proclaim|protest|purr|recall|remark|report|respond|scream|shout|shriek|sigh|sulk|whisper|whoop|yammer|yap|yell|yelp)(?:s|ed|ing)?|(?:advis|announc|acknowledg|cackl|chortl|chuckl|conced|conclud|decid|declar|dron|grip|grous|inton|not|observ|pledg|propos|stat|whin|whing)(?:e|es|ed|ing)|(?:bitch|confess|kibitz|kibbitz|screech)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|(?:cr|repl)(?:y|ied|ies|ying)|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|signal|signals|signaled|signalled|signaling|signalling|think|thinks|thinking|thought)$/";
 
 
   // TODO: is there some better pattern to look for? We do not have tag information at this point

From 129acecc1f6ffc1dab3642b88c03daf5b2f2bf02 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Tue, 4 Jun 2024 17:53:52 -0700
Subject: [PATCH 12/15] Fixes Yoda speak dependencies

actually, there's only one in PTB - Also excluded will be investments ...
---
 .../nlp/trees/CoordinationTransformer.java     | 18 +++++++++++++++++-
 .../trees/EnglishGrammaticalStructureTest.java |  2 +-
 ...iversalEnglishGrammaticalStructureTest.java |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/edu/stanford/nlp/trees/CoordinationTransformer.java b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
index 7af8137f0c..debf10c151 100644
--- a/src/edu/stanford/nlp/trees/CoordinationTransformer.java
+++ b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
@@ -164,7 +164,10 @@ public Tree transformTree(Tree t) {
     if (VERBOSE) {
       debugLine("After rearrangeNowThat:           ", t);
     }
-
+    t = mergeYodaVerbs(t);
+    if (VERBOSE) {
+      debugLine("After mergeYodaVerbs:             ", t);
+    }
     return t;
   }
 
@@ -182,6 +185,19 @@ private static Tree rearrangeNowThat(Tree t) {
   }
 
 
+  private static final TregexPattern mergeYodaVerbsTregex =
+    TregexPattern.compile("VP=home < VBN=vbn $+ (VP=willbe <... {(__=will < will|have|has) ; (VP < (__=be << be|been))})");
+
+  private static final TsurgeonPattern mergeYodaVerbsTsurgeon =
+    Tsurgeon.parseOperation("[createSubtree VP vbn] [move will >-1 home] [move be >-1 home] [prune willbe]");
+
+  private static Tree mergeYodaVerbs(Tree t) {
+    if (t == null) {
+      return t;
+    }
+    return Tsurgeon.processPattern(mergeYodaVerbsTregex, mergeYodaVerbsTsurgeon, t);
+  }
+
   private static final TregexPattern changeSbarToPPTregex =
     TregexPattern.compile("NP < (NP $++ (SBAR=sbar < (IN < /^(?i:after|before|until|since|during)$/ $++ S)))");
 
diff --git a/test/src/edu/stanford/nlp/trees/EnglishGrammaticalStructureTest.java b/test/src/edu/stanford/nlp/trees/EnglishGrammaticalStructureTest.java
index 1763058d21..ed1e6ec9fe 100644
--- a/test/src/edu/stanford/nlp/trees/EnglishGrammaticalStructureTest.java
+++ b/test/src/edu/stanford/nlp/trees/EnglishGrammaticalStructureTest.java
@@ -285,7 +285,7 @@ public void testBasicRelations() {
                 "aux(excluded-7, will-5)\n" +
                 "auxpass(excluded-7, be-6)\n" + "root(ROOT-0, excluded-7)\n",
         "advmod(excluded-2, Also-1)\n" + "root(ROOT-0, excluded-2)\n" +
-                "aux(be-4, will-3)\n" +     // should really be aux(excluded-2, will-3) but impossible at present without reconstructing topicalized VP semantic head
+                "aux(excluded-2, will-3)\n" +
                 "auxpass(excluded-2, be-4)\n" +
                 "nsubjpass(excluded-2, investments-5)\n" +
                 "prep(investments-5, in-6)\n" +
diff --git a/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java b/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java
index 1bb6d2fc56..f19f90d545 100644
--- a/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java
+++ b/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java
@@ -580,7 +580,7 @@ public static Collection<Object[]> testCases() {
           "( (SINV (ADVP (RB Also)) (VP (VBN excluded)) (VP (MD will) (VP (VB be))) (NP (NP (NNS investments)) (PP (IN in) (NP (NNP South) (NNP Africa)))) (. .)))",
           "advmod(excluded-2, Also-1)\n" +
            "root(ROOT-0, excluded-2)\n" +
-           "aux(be-4, will-3)\n" + // should really be aux(excluded-2, will-3) but impossible at present without reconstructing topicalized VP semantic head
+           "aux(excluded-2, will-3)\n" +
            "aux:pass(excluded-2, be-4)\n" +
            "nsubj:pass(excluded-2, investments-5)\n" +
            "case(Africa-8, in-6)\n" +

From 1535cab985bd88976084584cc4dcec3793abcda3 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Tue, 4 Jun 2024 20:37:50 -0700
Subject: [PATCH 13/15] Sort the lines of the graph when printing it for debug
 purposes, so that random hash ordering doesn't cause the graph to have a
 different output

---
 .../stanford/nlp/graph/DirectedMultiGraph.java  | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/edu/stanford/nlp/graph/DirectedMultiGraph.java b/src/edu/stanford/nlp/graph/DirectedMultiGraph.java
index 26639d8f50..52b3171c27 100644
--- a/src/edu/stanford/nlp/graph/DirectedMultiGraph.java
+++ b/src/edu/stanford/nlp/graph/DirectedMultiGraph.java
@@ -666,17 +666,30 @@ public String toString() {
     StringBuilder s = new StringBuilder();
     s.append("{\n");
     s.append("Vertices:\n");
+
+    List<String> lines = new ArrayList<>();
     for (V vertex : outgoingEdges.keySet()) {
-      s.append("  ").append(vertex).append('\n');
+      lines.add("  " + vertex + '\n');
+    }
+    Collections.sort(lines);
+    for (String line : lines) {
+      s.append(line);
     }
+
     s.append("Edges:\n");
+    lines = new ArrayList<>();
     for (V source : outgoingEdges.keySet()) {
       for (V dest : outgoingEdges.get(source).keySet()) {
         for (E edge : outgoingEdges.get(source).get(dest)) {
-          s.append("  ").append(source).append(" -> ").append(dest).append(" : ").append(edge).append('\n');
+          lines.add("  " + source + " -> " + dest + " : " + edge + "\n");
         }
       }
     }
+    Collections.sort(lines);
+    for (String line : lines) {
+      s.append(line);
+    }
+
     s.append('}');
     return s.toString();
   }

From 2945cac886c6757c872f6e22bcda9a0774356e26 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Thu, 7 Nov 2024 09:34:46 -0800
Subject: [PATCH 14/15] update LS First -> RB First in the Treebank Corrector

---
 .../nlp/trees/treebank/EnglishPTBTreebankCorrector.java       | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
index eba377a72a..21d6c5aa64 100644
--- a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
+++ b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
@@ -1230,6 +1230,10 @@ private static BufferedReader getBufferedReader(String source) {
     "relabel bad VBN\n" +
             '\n') +
 
+    // First, second, third are treated as LS in PTB
+    // but in UD EWT, GUM, etc they are treated as RB
+    ("@ADVP <: LS=bad\n" +
+     "relabel bad RB\n\n") +
 
     ("@SBAR < (DT|WDT|NN|NNP|RB=bad < that|because|while|Though|Whether)\n" +
     "relabel bad IN\n" +

From a2de4607e2997bb2fed7e6e8ed5f8cfdf76cf62c Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Thu, 7 Nov 2024 21:05:56 -0800
Subject: [PATCH 15/15] When processing "not only" and similar phrases into UD,
 separate them from the CONJP (sometimes ADVP by error) that they show up in. 
 This allows the later part of the converter to connect both of them to the
 parent with advmod.

As part of this, turn the UPOS of "not" into PART

Also, update the corrector to make a few changes to the structure, which may help usages of the trees or of SD as well as the UD.  Note, however, that the UD changes are written to accommodate the structural errors in the original PTB.
---
 .../stanford/nlp/trees/CoordinationTransformer.java  |  8 ++++++++
 .../trees/UniversalEnglishGrammaticalRelations.java  |  1 -
 src/edu/stanford/nlp/trees/UniversalPOSMapper.java   |  3 +++
 .../trees/treebank/EnglishPTBTreebankCorrector.java  | 10 ++++++++++
 .../UniversalEnglishGrammaticalStructureTest.java    | 12 ++++++------
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/edu/stanford/nlp/trees/CoordinationTransformer.java b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
index debf10c151..6d5883ae94 100644
--- a/src/edu/stanford/nlp/trees/CoordinationTransformer.java
+++ b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
@@ -728,6 +728,13 @@ private static Tree findCCparent(Tree t, Tree root) {
   private static final TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))");
   private static final TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]");
 
+  /*
+   * "not only" is not a MWE, so break up the CONJP (sometimes an ADVP by error) as for "but also".
+   * Accept JJ-tagged "only"; the alternation sits in a group so ^ and $ anchor every alternative.
+   */
+  private static final TregexPattern NOT_ONLY_PATTERN = TregexPattern.compile("CONJP|ADVP=conjp < (RB=not < /^(?i:not)$/) < (RB|JJ=only < /^(?i:only|just|merely|even)$/) ?$+ (__=nextNode < (__ < __))");
+  private static final TsurgeonPattern NOT_ONLY_OPERATION = Tsurgeon.parseOperation("[move not $- conjp] [move only $- not] [if exists nextNode move only >1 nextNode] [if exists nextNode move not >1 nextNode] [createSubtree ADVP not] [createSubtree ADVP only] [delete conjp]");
+
   /* at least / at most / at best / at worst / ... should be treated as if "at"
      was a preposition and the RBS was a noun. Assumes that the MWE "at least"
      has already been extracted. */
@@ -749,6 +756,7 @@ public static Tree MWETransform(Tree t) {
     
     Tsurgeon.processPattern(ACCORDING_TO_PATTERN, ACCORDING_TO_OPERATION, t);
     Tsurgeon.processPattern(BUT_ALSO_PATTERN, BUT_ALSO_OPERATION, t);
+    Tsurgeon.processPattern(NOT_ONLY_PATTERN, NOT_ONLY_OPERATION, t);
     Tsurgeon.processPattern(AT_RBS_PATTERN, AT_RBS_OPERATION, t);
     Tsurgeon.processPattern(AT_ALL_PATTERN, AT_ALL_OPERATION, t);
 
diff --git a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
index 2579127dee..22c8b96ade 100644
--- a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
+++ b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
@@ -1312,7 +1312,6 @@ private UniversalEnglishGrammaticalRelations() {}
         MODIFIER,
         "S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR|NP(?:-TMP|-ADV)?", tregexCompiler,
             "NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ $++ CC)",
-            "NP|NP-TMP|NP-ADV|NX|NML < (CONJP=target < (RB < /^(?i:not)$/) < (RB|JJ < /^(?i:only|merely|just)$/) $++ CC|CONJP)",
             // This matches weird/wrong NP-internal preconjuncts where you get (NP PDT (NP NP CC NP)) or similar
             "NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ ) < (NP < CC)",
             "/^S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR$/ < (PDT|DT|CC=target < /^(?i:either|neither|both)$/ $++ CC)",
diff --git a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
index dfc394dfd1..1840e583d8 100644
--- a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
+++ b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
@@ -134,6 +134,9 @@ public static void load() {
       // RB -> PART when it is verbal negation (not or its reductions)
       { "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" },
 
+      // "not" as part of a phrase such as "not only", "not just", etc is tagged as PART in UD
+      { "@ADVP|CONJP <1 (RB=target < /^(?i:not|n't|nt|t|n)$/) <2 (__ < only|just|merely|even) !<3 __", "PART" },
+
       // Otherwise RB -> ADV
       { "RB=target <... {/.*/}", "ADV" },
 
diff --git a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
index 21d6c5aa64..da14689740 100644
--- a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
+++ b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
@@ -168,6 +168,16 @@ private static BufferedReader getBufferedReader(String source) {
     "adjoin (NP NN@) newnp\n" +
             '\n') +
 
+    // Fix not_RB only_JJ, which should generally be not_RB only_RB
+    // and put it under a CONJP instead of an ADVP
+    ("ADVP|CONJP <1 (__ < /^(?i:not)$/) <2 (JJ=bad < only|just|merely|even) !<3 __\n" +
+     "relabel bad RB\n" +
+     '\n') +
+
+    ("ADVP=bad <1 (__ < /^(?i:not)$/) <2 (RB < only|just|merely|even) !<3 __\n" +
+     "relabel bad CONJP\n" +
+     '\n') +
+
     // Fix some cases of 'as well as' not made into a CONJP unit
     // There are a few other weird cases that should also be reviewed with the tregex
     // well|Well|WELL , as|AS|As . as|AS|As !>(__ > @CONJP)
diff --git a/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java b/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java
index f19f90d545..f35a10399b 100644
--- a/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java
+++ b/test/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalStructureTest.java
@@ -522,8 +522,8 @@ public static Collection<Object[]> testCases() {
           "( (S (NP (PRP I)) (VP (VBP like) (NP (CONJP (RB not) (RB only)) (NP (NNS cats)) (CONJP (CC but) (RB also)) (NP (NN dogs)))) (. .)))",
           "nsubj(like-2, I-1)\n" +
            "root(ROOT-0, like-2)\n" +
-           "advmod(only-4, not-3)\n" +
-           "cc:preconj(cats-5, only-4)\n" +
+           "advmod(cats-5, not-3)\n" +
+           "advmod(cats-5, only-4)\n" +
            "obj(like-2, cats-5)\n" +
            "cc(dogs-8, but-6)\n" +
            "advmod(dogs-8, also-7)\n" +
@@ -2497,8 +2497,8 @@ public static Collection<Object[]> testCases() {
               "( (S (NP (PRP I)) (VP (VBP like) (NP (CONJP (RB not) (RB only)) (NP (NNS cats)) (CONJP (CC but) (RB also)) (NP (NN dogs)))) (. .)))",
               "nsubj(like-2, I-1)\n" +
                "root(ROOT-0, like-2)\n" +
-               "advmod(only-4, not-3)\n" +
-               "cc:preconj(cats-5, only-4)\n" +
+               "advmod(cats-5, not-3)\n" +
+               "advmod(cats-5, only-4)\n" +
                "obj(like-2, cats-5)\n" +
                "cc(dogs-8, but-6)\n" +
                "advmod(dogs-8, also-7)\n" +
@@ -2510,8 +2510,8 @@ public static Collection<Object[]> testCases() {
                "nsubj(flew-2', Fred-1)\n" +
                "root(ROOT-0, flew-2)\n" +
                "conj:and(flew-2, flew-2')\n" +
-               "advmod(only-4, not-3)\n" +
-               "cc:preconj(Greece-6, only-4)\n" +
+               "advmod(Greece-6, not-3)\n" +
+               "advmod(Greece-6, only-4)\n" +
                "case(Greece-6, to-5)\n" +
                "obl:to(flew-2, Greece-6)\n" +
                "cc(flew-2', but-7)\n" +