From 000b005202431108aea837fa8eb730c9c8adfad6 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Sat, 27 Apr 2024 16:13:03 -0700 Subject: [PATCH 01/15] Change the dependency relation of list items to discourse instead of nummod, as described in https://github.com/UniversalDependencies/UD_English-EWT/issues/518 --- .../UniversalEnglishGrammaticalRelations.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java index 67f1703384..2579127dee 100644 --- a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java +++ b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java @@ -881,11 +881,6 @@ private UniversalEnglishGrammaticalRelations() {} * the meaning of the NP. Also, the enumeration of lists have * this relation to the head of the list item. *
- * Also, the enumeration of lists have this relation to the head of - * the list item. For that, we allow the list of constituents which - * have a list under them in any of the training data, as the parser - * will likely not produce anything else anyway. - *
* PTB: PP NP X S FRAG
* EWT: SQ SBARQ SINV SBAR NML VP
* Craft: PRN
@@ -905,9 +900,7 @@ private UniversalEnglishGrammaticalRelations() {} // Note that the earlier tregexes are usually enough to cover those phrases, such as when // the QP is by itself in an ADJP or NP, but sometimes it can have other siblings such // as in the phrase "$ 100 million or more". In that case, this next expression is needed. - "QP < QP=target < /^[$]$/", - // Lists are treated as nummod in UD_English-EWT - "PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target"); + "QP < QP=target < /^[$]$/"); /** @@ -1019,12 +1012,19 @@ private UniversalEnglishGrammaticalRelations() {} * define this to include: interjections (oh, uh-huh, Welcome), fillers (um, ah), * and discourse markers (well, like, actually, but not: you know). * We also use it for emoticons. + *
+ * Also, the enumeration of a list has this relation to the head of + * the list item. For that, we allow the list of constituents which + * have a list under them in any of the training data, as the parser + * will likely not produce anything else anyway. */ public static final GrammaticalRelation DISCOURSE_ELEMENT = new GrammaticalRelation(Language.UniversalEnglish, "discourse", "discourse element", MODIFIER, ".*", tregexCompiler, "__ < (NFP=target [ < " + WESTERN_SMILEY + " | < " + ASIAN_SMILEY + " ] )", - "__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]"); + "__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]", + // Lists are treated as discourse in UD_English-EWT as of 2.14 + "PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target"); /** From d7e703c1a884c92ae315aa0e362e0ea263931bea Mon Sep 17 00:00:00 2001 From: John Bauer Date: Sun, 28 Apr 2024 01:17:15 -0700 Subject: [PATCH 02/15] UPOS for LS can be NUM, not X - for example, first, 1), a) --- src/edu/stanford/nlp/trees/UniversalPOSMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java index 6a5905a0c5..dfc394dfd1 100644 --- a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java +++ b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java @@ -165,7 +165,7 @@ public static void load() { {"EX", "PRON"}, {"FW", "X"}, {"/^JJ.*$/", "ADJ"}, - {"LS", "X"}, + {"LS", "NUM"}, {"MD", "AUX"}, {"NNS", "NOUN"}, {"NNP", "PROPN"}, From b2048f6b831060ef0675f83091b568e5522e2a8d Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 29 Apr 2024 08:06:58 -0700 Subject: [PATCH 03/15] Remove the QP in a structure such as '(NP (QP About a) day)' so that the resulting dependencies both connect to day instead of from about -> a, changing the UD nummod to a det --- 
src/edu/stanford/nlp/trees/QPTreeTransformer.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/edu/stanford/nlp/trees/QPTreeTransformer.java b/src/edu/stanford/nlp/trees/QPTreeTransformer.java index a727b449eb..67b750de3a 100644 --- a/src/edu/stanford/nlp/trees/QPTreeTransformer.java +++ b/src/edu/stanford/nlp/trees/QPTreeTransformer.java @@ -100,6 +100,16 @@ public Tree transformTree(Tree t) { private static final TsurgeonPattern splitMoneyTsurgeon = Tsurgeon.parseOperation("createSubtree QP left right"); + // Remove QP in a structure such as + // (NP (QP nearly_RB all_DT) stuff_NN) + // so that the converter can attach both `nearly` and `all` to `stuff` + // not using a nummod, either, which is kind of annoying + private static final TregexPattern flattenAdvmodTregex = + TregexPattern.compile("NP < (QP=remove <1 RB <2 (DT !$+ __) $++ /^N/)"); + + private static final TsurgeonPattern flattenAdvmodTsurgeon = + Tsurgeon.parseOperation("excise remove remove"); + /** * Transforms t if it contains one of the following QP structure: *
    @@ -121,6 +131,7 @@ public Tree QPtransform(Tree t) { } t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t); t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t); + t = Tsurgeon.processPattern(flattenAdvmodTregex, flattenAdvmodTsurgeon, t); return t; } From 716841bb67f480e55f75829306269771dde979c7 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 29 Apr 2024 09:28:00 -0700 Subject: [PATCH 04/15] Useful debugging output from CoordinationTransformer. Could think about changing TreeGraphNode to print out the whole tree, but that would presumably mess up some various dependency outputs --- .../nlp/trees/CoordinationTransformer.java | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/edu/stanford/nlp/trees/CoordinationTransformer.java b/src/edu/stanford/nlp/trees/CoordinationTransformer.java index 50c2366140..7af8137f0c 100644 --- a/src/edu/stanford/nlp/trees/CoordinationTransformer.java +++ b/src/edu/stanford/nlp/trees/CoordinationTransformer.java @@ -71,6 +71,14 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation) qp = new QPTreeTransformer(performMWETransformation); } + public void debugLine(String prefix, Tree t) { + if (t instanceof TreeGraphNode) { + log.info(prefix + ((TreeGraphNode) t).toOneLineString()); + } else { + log.info(prefix + t); + } + } + /** * Transforms t if it contains a coordination in a flat structure (CCtransform) * and transforms UCP (UCPtransform). 
@@ -81,19 +89,19 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation) @Override public Tree transformTree(Tree t) { if (VERBOSE) { - log.info("Input to CoordinationTransformer: " + t); + debugLine("Input to CoordinationTransformer: ", t); } if (performMWETransformation) { t = gappingTransform(t); if (VERBOSE) { - log.info("After t = gappingTransform(t);\n: " + t); + debugLine("After t = gappingTransform(t);: ", t); } } t = tn.transformTree(t); if (VERBOSE) { - log.info("After DependencyTreeTransformer: " + t); + debugLine("After DependencyTreeTransformer: ", t); } if (t == null) { return t; @@ -102,59 +110,59 @@ public Tree transformTree(Tree t) { if (performMWETransformation) { t = MWETransform(t); if (VERBOSE) { - log.info("After MWETransform: " + t); + debugLine("After MWETransform: ", t); } t = MWFlatTransform(t); if (VERBOSE) { - log.info("After MWFlatTransform: " + t); + debugLine("After MWFlatTransform: ", t); } t = prepCCTransform(t); if (VERBOSE) { - log.info("After prepCCTransform: " + t); + debugLine("After prepCCTransform: ", t); } } t = UCPtransform(t); if (VERBOSE) { - log.info("After UCPTransformer: " + t); + debugLine("After UCPTransformer: ", t); } t = CCtransform(t); if (VERBOSE) { - log.info("After CCTransformer: " + t); + debugLine("After CCTransformer: ", t); } t = qp.transformTree(t); if (VERBOSE) { - log.info("After QPTreeTransformer: " + t); + debugLine("After QPTreeTransformer: ", t); } t = SQflatten(t); if (VERBOSE) { - log.info("After SQ flattening: " + t); + debugLine("After SQ flattening: ", t); } t = dates.transformTree(t); if (VERBOSE) { - log.info("After DateTreeTransformer: " + t); + debugLine("After DateTreeTransformer: ", t); } t = removeXOverX(t); if (VERBOSE) { - log.info("After removeXoverX: " + t); + debugLine("After removeXoverX: ", t); } t = combineConjp(t); if (VERBOSE) { - log.info("After combineConjp: " + t); + debugLine("After combineConjp: ", t); } t = moveRB(t); if (VERBOSE) { - 
log.info("After moveRB: " + t); + debugLine("After moveRB: ", t); } t = changeSbarToPP(t); if (VERBOSE) { - log.info("After changeSbarToPP: " + t); + debugLine("After changeSbarToPP: ", t); } t = rearrangeNowThat(t); if (VERBOSE) { - log.info("After rearrangeNowThat: " + t); + debugLine("After rearrangeNowThat: ", t); } return t; From a1d8326a8ecc43bd3df635bc9c1ff692f325e1d8 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 29 Apr 2024 09:29:41 -0700 Subject: [PATCH 05/15] Also flatten combined RB or ADVP phrases --- .../stanford/nlp/trees/QPTreeTransformer.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/edu/stanford/nlp/trees/QPTreeTransformer.java b/src/edu/stanford/nlp/trees/QPTreeTransformer.java index 67b750de3a..4885e19d4b 100644 --- a/src/edu/stanford/nlp/trees/QPTreeTransformer.java +++ b/src/edu/stanford/nlp/trees/QPTreeTransformer.java @@ -100,12 +100,25 @@ public Tree transformTree(Tree t) { private static final TsurgeonPattern splitMoneyTsurgeon = Tsurgeon.parseOperation("createSubtree QP left right"); + // This fixes a very rare subset of parses + // such as "(NP (QP just about all) the losses) ..." 
+ // in fact, that's the only example in ptb3-revised + // because of previous MWE combinations, we may already get + // "(NP (QP at least a) day)" + // -> "(NP (QP (ADVP at least) a) day)" + // and therefore the flattenAdvmodTsurgeon will also find that parse + private static final TregexPattern groupADVPTregex = + TregexPattern.compile("NP < (QP <1 RB=first <2 RB=second <3 (DT !$+ __) $++ /^N/)"); + + private static final TsurgeonPattern groupADVPTsurgeon = + Tsurgeon.parseOperation("createSubtree ADVP first second"); + // Remove QP in a structure such as // (NP (QP nearly_RB all_DT) stuff_NN) // so that the converter can attach both `nearly` and `all` to `stuff` // not using a nummod, either, which is kind of annoying private static final TregexPattern flattenAdvmodTregex = - TregexPattern.compile("NP < (QP=remove <1 RB <2 (DT !$+ __) $++ /^N/)"); + TregexPattern.compile("NP < (QP=remove <1 ADVP|RB <2 (DT !$+ __) $++ /^N/)"); private static final TsurgeonPattern flattenAdvmodTsurgeon = Tsurgeon.parseOperation("excise remove remove"); @@ -131,6 +144,7 @@ public Tree QPtransform(Tree t) { } t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t); t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t); + t = Tsurgeon.processPattern(groupADVPTregex, groupADVPTsurgeon, t); t = Tsurgeon.processPattern(flattenAdvmodTregex, flattenAdvmodTsurgeon, t); return t; } From 39f29af7da527962671a0ec921316942e4ad52eb Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 29 Apr 2024 18:53:04 -0700 Subject: [PATCH 06/15] Update a couple trees to have adverb types instead of ADP. 
Need to make sure the XPOS tags get updated in the converter when using the PTB corrector --- .../nlp/trees/treebank/EnglishPTBTreebankCorrector.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java index 2d93578703..eba377a72a 100644 --- a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java +++ b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java @@ -1166,6 +1166,15 @@ private static BufferedReader getBufferedReader(String source) { "") + + // for structures such as "over a year", "about a decade", etc + ("NP < (QP <1 IN=bad <2 (DT !$+ __) $+ /^N/)\n" + + "relabel bad RB\n" + + "\n") + + + // for structures such as "just over a decade" + ("NP < (QP <1 (RB < just) <2 IN=bad <3 (DT !$+ __) $+ /^N/)\n" + + "relabel bad RB\n" + + "\n") + ("@QP < (IN|JJ|RBR|RP=bad < about)\n" + "relabel bad RB\n" + From 8fc2313698c0ce3624170620d1827a10152458f3 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 29 Apr 2024 19:31:03 -0700 Subject: [PATCH 07/15] If the Corrector is used, use its xpos tags as well when building the conll --- .../nlp/trees/ud/UniversalDependenciesConverter.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java index 337d8b5cf2..539c5dd4c2 100644 --- a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java +++ b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java @@ -288,6 +288,12 @@ public static void main(String[] args) { Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree(); if (ptbCorrector != null) { tree = ptbCorrector.transformTree(tree); + List