stanfordnlp · AngledLuffa · Nov 9, 2024 · Apr 27, 2024 · Apr 28, 2024 · Apr 29, 2024
diff --git a/src/edu/stanford/nlp/graph/DirectedMultiGraph.java b/src/edu/stanford/nlp/graph/DirectedMultiGraph.java
@@ -666,17 +666,30 @@ public String toString() {
     StringBuilder s = new StringBuilder();
     s.append("{\n");
     s.append("Vertices:\n");
+
+    List<String> lines = new ArrayList<>();
     for (V vertex : outgoingEdges.keySet()) {
-      s.append("  ").append(vertex).append('\n');
+      lines.add("  " + vertex + '\n');
+    }
+    Collections.sort(lines);
+    for (String line : lines) {
+      s.append(line);
     }
+
     s.append("Edges:\n");
+    lines = new ArrayList<>();
     for (V source : outgoingEdges.keySet()) {
       for (V dest : outgoingEdges.get(source).keySet()) {
         for (E edge : outgoingEdges.get(source).get(dest)) {
-          s.append("  ").append(source).append(" -> ").append(dest).append(" : ").append(edge).append('\n');
+          lines.add("  " + source + " -> " + dest + " : " + edge + "\n");
         }
       }
     }
+    Collections.sort(lines);
+    for (String line : lines) {
+      s.append(line);
+    }
+
     s.append('}');
     return s.toString();
   }

diff --git a/src/edu/stanford/nlp/trees/CompositeTreeTransformer.java b/src/edu/stanford/nlp/trees/CompositeTreeTransformer.java
@@ -2,6 +2,7 @@
 
 import java.util.List;
 import java.util.ArrayList;
+import java.util.Arrays;
 
 /**
  * A TreeTransformer that applies component TreeTransformers in order.
@@ -21,6 +22,10 @@ public CompositeTreeTransformer(List<TreeTransformer> tt) {
     transformers.addAll(tt);
   }
 
+  public CompositeTreeTransformer(TreeTransformer ... tt) {
+    transformers.addAll(Arrays.asList(tt));
+  }
+
   public void addTransformer(TreeTransformer tt) {
     transformers.add(tt);
   }

diff --git a/src/edu/stanford/nlp/trees/CoordinationTransformer.java b/src/edu/stanford/nlp/trees/CoordinationTransformer.java
@@ -71,6 +71,14 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation)
     qp = new QPTreeTransformer(performMWETransformation);
   }
 
+  public void debugLine(String prefix, Tree t) {
+    // TreeGraphNode is rendered via toOneLineString(); anything else (or null) via string conversion
+    final String rendered = (t instanceof TreeGraphNode)
+        ? ((TreeGraphNode) t).toOneLineString()
+        : String.valueOf(t);
+    log.info(prefix + rendered);
+  }
+
   /**
    * Transforms t if it contains a coordination in a flat structure (CCtransform)
    * and transforms UCP (UCPtransform).
@@ -81,19 +89,19 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation)
   @Override
   public Tree transformTree(Tree t) {
     if (VERBOSE) {
-      log.info("Input to CoordinationTransformer: " + t);
+      debugLine("Input to CoordinationTransformer: ", t);
     }
 
     if (performMWETransformation) {
       t = gappingTransform(t);
       if (VERBOSE) {
-        log.info("After       t = gappingTransform(t);\n:  " + t);
+        debugLine("After t = gappingTransform(t);:   ", t);
       }
     }
 
     t = tn.transformTree(t);
     if (VERBOSE) {
-      log.info("After DependencyTreeTransformer:  " + t);
+      debugLine("After DependencyTreeTransformer:  ", t);
     }
     if (t == null) {
       return t;
@@ -102,61 +110,64 @@ public Tree transformTree(Tree t) {
     if (performMWETransformation) {
       t = MWETransform(t);
       if (VERBOSE) {
-        log.info("After MWETransform:               " + t);
+        debugLine("After MWETransform:               ", t);
       }
 
       t = MWFlatTransform(t);
       if (VERBOSE) {
-        log.info("After MWFlatTransform:            " + t);
+        debugLine("After MWFlatTransform:            ", t);
       }
 
       t = prepCCTransform(t);
       if (VERBOSE) {
-        log.info("After prepCCTransform:               " + t);
+        debugLine("After prepCCTransform:            ", t);
       }
     }
 
     t = UCPtransform(t);
     if (VERBOSE) {
-      log.info("After UCPTransformer:             " + t);
+      debugLine("After UCPTransformer:             ", t);
     }
     t = CCtransform(t);
     if (VERBOSE) {
-      log.info("After CCTransformer:              " + t);
+      debugLine("After CCTransformer:              ", t);
     }
     t = qp.transformTree(t);
     if (VERBOSE) {
-      log.info("After QPTreeTransformer:          " + t);
+      debugLine("After QPTreeTransformer:          ", t);
     }
     t = SQflatten(t);
     if (VERBOSE) {
-      log.info("After SQ flattening:              " + t);
+      debugLine("After SQ flattening:              ", t);
     }
     t = dates.transformTree(t);
     if (VERBOSE) {
-      log.info("After DateTreeTransformer:        " + t);
+      debugLine("After DateTreeTransformer:        ", t);
     }
     t = removeXOverX(t);
     if (VERBOSE) {
-      log.info("After removeXoverX:               " + t);
+      debugLine("After removeXoverX:               ", t);
     }
     t = combineConjp(t);
     if (VERBOSE) {
-      log.info("After combineConjp:               " + t);
+      debugLine("After combineConjp:               ", t);
     }
     t = moveRB(t);
     if (VERBOSE) {
-      log.info("After moveRB:                     " + t);
+      debugLine("After moveRB:                     ", t);
     }
     t = changeSbarToPP(t);
     if (VERBOSE) {
-      log.info("After changeSbarToPP:             " + t);
+      debugLine("After changeSbarToPP:             ", t);
     }
     t = rearrangeNowThat(t);
     if (VERBOSE) {
-      log.info("After rearrangeNowThat:           " + t);
+      debugLine("After rearrangeNowThat:           ", t);
+    }
+    t = mergeYodaVerbs(t);
+    if (VERBOSE) {
+      debugLine("After mergeYodaVerbs:             ", t);
     }
-
     return t;
   }
 
@@ -174,6 +185,19 @@ private static Tree rearrangeNowThat(Tree t) {
   }
 
 
+  private static final TregexPattern mergeYodaVerbsTregex =
+    TregexPattern.compile("VP=home < VBN=vbn $+ (VP=willbe <... {(__=will < will|have|has) ; (VP < (__=be << be|been))})");
+
+  private static final TsurgeonPattern mergeYodaVerbsTsurgeon =
+    Tsurgeon.parseOperation("[createSubtree VP vbn] [move will >-1 home] [move be >-1 home] [prune willbe]");
+
+  private static Tree mergeYodaVerbs(Tree t) {
+    // a null tree is passed through unchanged; otherwise apply the tregex/tsurgeon pair above
+    return (t == null)
+        ? null
+        : Tsurgeon.processPattern(mergeYodaVerbsTregex, mergeYodaVerbsTsurgeon, t);
+  }
+
   private static final TregexPattern changeSbarToPPTregex =
     TregexPattern.compile("NP < (NP $++ (SBAR=sbar < (IN < /^(?i:after|before|until|since|during)$/ $++ S)))");
 
@@ -704,6 +728,13 @@ private static Tree findCCparent(Tree t, Tree root) {
   private static final TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))");
   private static final TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]");
 
+  /*
+   * "not only" is not a MWE, so break up the CONJP similar to "but also".
+   * compensate for some JJ tagged "only" in this expression; word regexes match the whole token
+   */
+  private static final TregexPattern NOT_ONLY_PATTERN = TregexPattern.compile("CONJP|ADVP=conjp < (RB=not < /^(?i)not$/) < (RB|JJ=only < /^(?i:only|just|merely|even)$/) ?$+ (__=nextNode < (__ < __))");
+  private static final TsurgeonPattern NOT_ONLY_OPERATION = Tsurgeon.parseOperation("[move not $- conjp] [move only $- not] [if exists nextNode move only >1 nextNode] [if exists nextNode move not >1 nextNode] [createSubtree ADVP not] [createSubtree ADVP only] [delete conjp]");
+
   /* at least / at most / at best / at worst / ... should be treated as if "at"
      was a preposition and the RBS was a noun. Assumes that the MWE "at least"
      has already been extracted. */
@@ -725,6 +756,7 @@ public static Tree MWETransform(Tree t) {
 
     Tsurgeon.processPattern(ACCORDING_TO_PATTERN, ACCORDING_TO_OPERATION, t);
     Tsurgeon.processPattern(BUT_ALSO_PATTERN, BUT_ALSO_OPERATION, t);
+    Tsurgeon.processPattern(NOT_ONLY_PATTERN, NOT_ONLY_OPERATION, t);
     Tsurgeon.processPattern(AT_RBS_PATTERN, AT_RBS_OPERATION, t);
     Tsurgeon.processPattern(AT_ALL_PATTERN, AT_ALL_OPERATION, t);
 

diff --git a/src/edu/stanford/nlp/trees/EnglishPatterns.java b/src/edu/stanford/nlp/trees/EnglishPatterns.java
@@ -86,9 +86,11 @@ public class EnglishPatterns {
 
   /** A list of verbs which are verbs of speaking that easily take an S (as a complement or topicalized)
    *  which is a direct speech ccomp. For example: "He concedes: ``This is a difficult market.''"
+   * <br>
+   * TODO: maybe sign, as in ASL?  sing ... wish?
    */
   public static final String sayVerbRegex =
-    "/^(?i:say|says|said|saying|(?:add|boast|counsel|explain|inform|interject|recall|remark|respond|proclaim|report|claim|shout|whisper|yell)(?:s|ed|ing)?|(?:advis|announc|acknowledg|conced|conclud|decid|declar|observ|stat|not|inton)(?:e|es|ed|ing)|(?:confess)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|reply|replied|replies|replying|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|think|thinks|thinking|thought)$/";
+    "/^(?i:say|says|said|saying|(?:add|bellow|bleat|blubber|bluster|boast|boom|bray|call|chant|chirp|claim|complain|coo|counsel|croak|crow|drawl|explain|gasp|inform|interject|pray|proclaim|protest|purr|recall|remark|report|respond|scream|shout|shriek|sigh|sulk|whisper|whoop|yammer|yap|yell|yelp)(?:s|ed|ing)?|(?:advis|announc|acknowledg|cackl|chortl|chuckl|conced|conclud|decid|declar|dron|grip|grous|inton|not|observ|pledg|propos|stat|whin|whing)(?:e|es|ed|ing)|(?:bitch|confess|kibitz|kibbitz|screech)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|(?:cr|repl)(?:y|ied|ies|ying)|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|signal|signals|signaled|signalled|signaling|signalling|think|thinks|thinking|thought)$/";
 
 
   // TODO: is there some better pattern to look for? We do not have tag information at this point

diff --git a/src/edu/stanford/nlp/trees/QPTreeTransformer.java b/src/edu/stanford/nlp/trees/QPTreeTransformer.java
@@ -100,6 +100,29 @@ public Tree transformTree(Tree t) {
   private static final TsurgeonPattern splitMoneyTsurgeon =
     Tsurgeon.parseOperation("createSubtree QP left right");
 
+  // This fixes a very rare subset of parses
+  // such as "(NP (QP just about all) the losses) ..."
+  // in fact, that's the only example in ptb3-revised
+  // because of previous MWE combinations, we may already get
+  //     "(NP (QP at least a) day)"
+  //  -> "(NP (QP (ADVP at least) a) day)"
+  // and therefore the flattenAdvmodTsurgeon will also find that parse
+  private static final TregexPattern groupADVPTregex =
+    TregexPattern.compile("NP < (QP <1 RB=first <2 RB=second <3 (DT !$+ __) $++ /^N/)");
+
+  private static final TsurgeonPattern groupADVPTsurgeon =
+    Tsurgeon.parseOperation("createSubtree ADVP first second");
+
+  // Remove QP in a structure such as
+  //   (NP (QP nearly_RB all_DT) stuff_NN)
+  // so that the converter can attach both `nearly` and `all` to `stuff`
+  // not using a nummod, either, which is kind of annoying
+  private static final TregexPattern flattenAdvmodTregex =
+    TregexPattern.compile("NP < (QP=remove <1 ADVP|RB <2 (DT !$+ __) $++ /^N/)");
+
+  private static final TsurgeonPattern flattenAdvmodTsurgeon =
+    Tsurgeon.parseOperation("excise remove remove");
+
   /**
    * Transforms t if it contains one of the following QP structure:
    * <ul>
@@ -121,6 +144,8 @@ public Tree QPtransform(Tree t) {
     }
     t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
     t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
+    t = Tsurgeon.processPattern(groupADVPTregex, groupADVPTsurgeon, t);
+    t = Tsurgeon.processPattern(flattenAdvmodTregex, flattenAdvmodTsurgeon, t);
     return t;
   }
 

diff --git a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java
@@ -881,11 +881,6 @@ private UniversalEnglishGrammaticalRelations() {}
    * the meaning of the NP.  Also, the enumeration of lists have
    * this relation to the head of the list item.
    * <br>
-   * Also, the enumeration of lists have this relation to the head of
-   * the list item.  For that, we allow the list of constituents which
-   * have a list under them in any of the training data, as the parser
-   * will likely not produce anything else anyway.
-   * <br>
    * PTB: PP NP X S FRAG <br>
    * EWT: SQ SBARQ SINV SBAR NML VP <br>
    * Craft: PRN <br>
@@ -905,9 +900,7 @@ private UniversalEnglishGrammaticalRelations() {}
             // Note that the earlier tregexes are usually enough to cover those phrases, such as when
             // the QP is by itself in an ADJP or NP, but sometimes it can have other siblings such
             // as in the phrase "$ 100 million or more".  In that case, this next expression is needed.
-            "QP < QP=target < /^[$]$/",
-            // Lists are treated as nummod in UD_English-EWT
-            "PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target");
+            "QP < QP=target < /^[$]$/");
 
 
   /**
@@ -1019,12 +1012,19 @@ private UniversalEnglishGrammaticalRelations() {}
    * define this to include: interjections (oh, uh-huh, Welcome), fillers (um, ah),
    * and discourse markers (well, like, actually, but not: you know).
    * We also use it for emoticons.
+   * <br>
+   * Also, the enumeration of lists have this relation to the head of
+   * the list item.  For that, we allow the list of constituents which
+   * have a list under them in any of the training data, as the parser
+   * will likely not produce anything else anyway.
    */
    public static final GrammaticalRelation DISCOURSE_ELEMENT =
     new GrammaticalRelation(Language.UniversalEnglish, "discourse", "discourse element",
         MODIFIER, ".*", tregexCompiler,
             "__ < (NFP=target [ < " + WESTERN_SMILEY + " | < " + ASIAN_SMILEY + " ] )",
-            "__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]");
+            "__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]",
+            // Lists are treated as discourse in UD_English-EWT as of 2.14
+            "PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target");
 
 
   /**
@@ -1312,7 +1312,6 @@ private UniversalEnglishGrammaticalRelations() {}
         MODIFIER,
         "S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR|NP(?:-TMP|-ADV)?", tregexCompiler,
             "NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ $++ CC)",
-            "NP|NP-TMP|NP-ADV|NX|NML < (CONJP=target < (RB < /^(?i:not)$/) < (RB|JJ < /^(?i:only|merely|just)$/) $++ CC|CONJP)",
             // This matches weird/wrong NP-internal preconjuncts where you get (NP PDT (NP NP CC NP)) or similar
             "NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ ) < (NP < CC)",
             "/^S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR$/ < (PDT|DT|CC=target < /^(?i:either|neither|both)$/ $++ CC)",

diff --git a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java
@@ -134,6 +134,9 @@ public static void load() {
       // RB -> PART when it is verbal negation (not or its reductions)
       { "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" },
 
+      // "not" as part of a phrase such as "not only", "not just", etc is tagged as PART in UD
+      { "@ADVP|CONJP <1 (RB=target < /^(?i:not|n't|nt|t|n)$/) <2 (__ < only|just|merely|even) !<3 __", "PART" },
+
       // Otherwise RB -> ADV
       { "RB=target <... {/.*/}", "ADV" },
 
@@ -165,7 +168,7 @@ public static void load() {
       {"EX", "PRON"},
       {"FW", "X"},
       {"/^JJ.*$/", "ADJ"},
-      {"LS", "X"},
+      {"LS", "NUM"},
       {"MD", "AUX"},
       {"NNS", "NOUN"},
       {"NNP", "PROPN"},

diff --git a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java
@@ -168,6 +168,16 @@ private static BufferedReader getBufferedReader(String source) {
     "adjoin (NP NN@) newnp\n" +
             '\n') +
 
+    // Fix not_RB only_JJ, which should generally be not_RB only_RB
+    // and put it under a CONJP instead of an ADVP
+    ("ADVP|CONJP <1 (__ < /^(?i:not)$/) <2 (JJ=bad < only|just|merely|even) !<3 __\n" +
+     "relabel bad RB\n" +
+     '\n') +
+
+    ("ADVP=bad <1 (__ < /^(?i:not)$/) <2 (RB < only|just|merely|even) !<3 __\n" +
+     "relabel bad CONJP\n" +
+     '\n') +
+
     // Fix some cases of 'as well as' not made into a CONJP unit
     // There are a few other weird cases that should also be reviewed with the tregex
     // well|Well|WELL , as|AS|As . as|AS|As !>(__ > @CONJP)
@@ -1166,6 +1176,15 @@ private static BufferedReader getBufferedReader(String source) {
 
     "") +
 
+    // for structures such as "over a year", "about a decade", etc
+    ("NP < (QP <1 IN=bad <2 (DT !$+ __) $+ /^N/)\n" +
+     "relabel bad RB\n" +
+     "\n") +
+
+    // for structures such as "just over a decade"
+    ("NP < (QP <1 (RB < just) <2 IN=bad <3 (DT !$+ __) $+ /^N/)\n" +
+     "relabel bad RB\n" +
+     "\n") +
 
     ("@QP < (IN|JJ|RBR|RP=bad < about)\n" +
     "relabel bad RB\n" +
@@ -1221,6 +1240,10 @@ private static BufferedReader getBufferedReader(String source) {
     "relabel bad VBN\n" +
             '\n') +
 
+    // First, second, third are treated as LS in PTB
+    // but in UD EWT, GUM, etc they are treated as RB
+    ("@ADVP <: LS=bad\n" +
+     "relabel bad RB\n\n") +
 
     ("@SBAR < (DT|WDT|NN|NNP|RB=bad < that|because|while|Though|Whether)\n" +
     "relabel bad IN\n" +