Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ud2 updates #1445

Merged
merged 15 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/edu/stanford/nlp/graph/DirectedMultiGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -666,17 +666,30 @@ public String toString() {
StringBuilder s = new StringBuilder();
s.append("{\n");
s.append("Vertices:\n");

List<String> lines = new ArrayList<>();
for (V vertex : outgoingEdges.keySet()) {
s.append(" ").append(vertex).append('\n');
lines.add(" " + vertex + '\n');
}
Collections.sort(lines);
for (String line : lines) {
s.append(line);
}

s.append("Edges:\n");
lines = new ArrayList<>();
for (V source : outgoingEdges.keySet()) {
for (V dest : outgoingEdges.get(source).keySet()) {
for (E edge : outgoingEdges.get(source).get(dest)) {
s.append(" ").append(source).append(" -> ").append(dest).append(" : ").append(edge).append('\n');
lines.add(" " + source + " -> " + dest + " : " + edge + "\n");
}
}
}
Collections.sort(lines);
for (String line : lines) {
s.append(line);
}

s.append('}');
return s.toString();
}
Expand Down
5 changes: 5 additions & 0 deletions src/edu/stanford/nlp/trees/CompositeTreeTransformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;

/**
* A TreeTransformer that applies component TreeTransformers in order.
Expand All @@ -21,6 +22,10 @@ public CompositeTreeTransformer(List<TreeTransformer> tt) {
transformers.addAll(tt);
}

/**
 * Varargs convenience constructor: adds each of the given transformers
 * to {@code transformers}, in the order they were passed.
 *
 * @param tt the component transformers to register
 */
public CompositeTreeTransformer(TreeTransformer ... tt) {
  for (TreeTransformer component : tt) {
    transformers.add(component);
  }
}

/**
 * Adds a single transformer to {@code transformers}.
 *
 * @param tt the transformer to add
 */
public void addTransformer(TreeTransformer tt) {
transformers.add(tt);
}
Expand Down
66 changes: 49 additions & 17 deletions src/edu/stanford/nlp/trees/CoordinationTransformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation)
qp = new QPTreeTransformer(performMWETransformation);
}

/**
 * Logs {@code prefix} followed by a rendering of {@code t} at info level.
 * A {@link TreeGraphNode} is rendered with {@code toOneLineString()};
 * any other tree (including {@code null}) uses its default string form.
 *
 * @param prefix text placed in front of the rendered tree
 * @param t the tree to log
 */
public void debugLine(String prefix, Tree t) {
  final String rendered = (t instanceof TreeGraphNode)
      ? ((TreeGraphNode) t).toOneLineString()
      : String.valueOf(t);
  log.info(prefix + rendered);
}

/**
* Transforms t if it contains a coordination in a flat structure (CCtransform)
* and transforms UCP (UCPtransform).
Expand All @@ -81,19 +89,19 @@ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation)
@Override
public Tree transformTree(Tree t) {
if (VERBOSE) {
log.info("Input to CoordinationTransformer: " + t);
debugLine("Input to CoordinationTransformer: ", t);
}

if (performMWETransformation) {
t = gappingTransform(t);
if (VERBOSE) {
log.info("After t = gappingTransform(t);\n: " + t);
debugLine("After t = gappingTransform(t);: ", t);
}
}

t = tn.transformTree(t);
if (VERBOSE) {
log.info("After DependencyTreeTransformer: " + t);
debugLine("After DependencyTreeTransformer: ", t);
}
if (t == null) {
return t;
Expand All @@ -102,61 +110,64 @@ public Tree transformTree(Tree t) {
if (performMWETransformation) {
t = MWETransform(t);
if (VERBOSE) {
log.info("After MWETransform: " + t);
debugLine("After MWETransform: ", t);
}

t = MWFlatTransform(t);
if (VERBOSE) {
log.info("After MWFlatTransform: " + t);
debugLine("After MWFlatTransform: ", t);
}

t = prepCCTransform(t);
if (VERBOSE) {
log.info("After prepCCTransform: " + t);
debugLine("After prepCCTransform: ", t);
}
}

t = UCPtransform(t);
if (VERBOSE) {
log.info("After UCPTransformer: " + t);
debugLine("After UCPTransformer: ", t);
}
t = CCtransform(t);
if (VERBOSE) {
log.info("After CCTransformer: " + t);
debugLine("After CCTransformer: ", t);
}
t = qp.transformTree(t);
if (VERBOSE) {
log.info("After QPTreeTransformer: " + t);
debugLine("After QPTreeTransformer: ", t);
}
t = SQflatten(t);
if (VERBOSE) {
log.info("After SQ flattening: " + t);
debugLine("After SQ flattening: ", t);
}
t = dates.transformTree(t);
if (VERBOSE) {
log.info("After DateTreeTransformer: " + t);
debugLine("After DateTreeTransformer: ", t);
}
t = removeXOverX(t);
if (VERBOSE) {
log.info("After removeXoverX: " + t);
debugLine("After removeXoverX: ", t);
}
t = combineConjp(t);
if (VERBOSE) {
log.info("After combineConjp: " + t);
debugLine("After combineConjp: ", t);
}
t = moveRB(t);
if (VERBOSE) {
log.info("After moveRB: " + t);
debugLine("After moveRB: ", t);
}
t = changeSbarToPP(t);
if (VERBOSE) {
log.info("After changeSbarToPP: " + t);
debugLine("After changeSbarToPP: ", t);
}
t = rearrangeNowThat(t);
if (VERBOSE) {
log.info("After rearrangeNowThat: " + t);
debugLine("After rearrangeNowThat: ", t);
}
t = mergeYodaVerbs(t);
if (VERBOSE) {
debugLine("After mergeYodaVerbs: ", t);
}

return t;
}

Expand All @@ -174,6 +185,19 @@ private static Tree rearrangeNowThat(Tree t) {
}


// Matches a VP (home) that has a VBN child (vbn) and is the immediate left
// sister of another VP (willbe).  Per Tregex's "<..." operator, willbe's
// children must be exactly: a node (will) directly over will/have/has,
// followed by a VP with a child (be) that dominates be/been.
// NOTE(review): presumably targets fronted-participle ("Yoda") word order;
// confirm against the treebank examples that motivated this.
private static final TregexPattern mergeYodaVerbsTregex =
TregexPattern.compile("VP=home < VBN=vbn $+ (VP=willbe <... {(__=will < will|have|has) ; (VP < (__=be << be|been))})");

// Wraps vbn in a new VP, moves will and then be to the end of home's
// children, and finally prunes the remaining willbe node.
private static final TsurgeonPattern mergeYodaVerbsTsurgeon =
Tsurgeon.parseOperation("[createSubtree VP vbn] [move will >-1 home] [move be >-1 home] [prune willbe]");

/**
 * Applies the mergeYodaVerbs tregex/tsurgeon pair to {@code t}.
 *
 * @param t the tree to rewrite; may be {@code null}
 * @return {@code null} when {@code t} is {@code null}, otherwise the result
 *         of {@code Tsurgeon.processPattern}
 */
private static Tree mergeYodaVerbs(Tree t) {
  return (t == null)
      ? null
      : Tsurgeon.processPattern(mergeYodaVerbsTregex, mergeYodaVerbsTsurgeon, t);
}

private static final TregexPattern changeSbarToPPTregex =
TregexPattern.compile("NP < (NP $++ (SBAR=sbar < (IN < /^(?i:after|before|until|since|during)$/ $++ S)))");

Expand Down Expand Up @@ -704,6 +728,13 @@ private static Tree findCCparent(Tree t, Tree root) {
private static final TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))");
private static final TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]");

/*
* "not only" is not a MWE, so break up the CONJP similar to "but also".
* compensate for some JJ tagged "only" in this expression
*
* The word alternation is wrapped in a (?i:...) group so that ^ and $
* anchor every alternative.  Tregex matches node regexes with find(), so
* an ungrouped "^only|just|merely|even$" would also accept labels that
* merely contain "just" or "merely", start with "only", or end in "even".
*/
private static final TregexPattern NOT_ONLY_PATTERN = TregexPattern.compile("CONJP|ADVP=conjp < (RB=not < /^(?i)not$/) < (RB|JJ=only < /^(?i:only|just|merely|even)$/) ?$+ (__=nextNode < (__ < __))");
private static final TsurgeonPattern NOT_ONLY_OPERATION = Tsurgeon.parseOperation("[move not $- conjp] [move only $- not] [if exists nextNode move only >1 nextNode] [if exists nextNode move not >1 nextNode] [createSubtree ADVP not] [createSubtree ADVP only] [delete conjp]");

/* at least / at most / at best / at worst / ... should be treated as if "at"
was a preposition and the RBS was a noun. Assumes that the MWE "at least"
has already been extracted. */
Expand All @@ -725,6 +756,7 @@ public static Tree MWETransform(Tree t) {

Tsurgeon.processPattern(ACCORDING_TO_PATTERN, ACCORDING_TO_OPERATION, t);
Tsurgeon.processPattern(BUT_ALSO_PATTERN, BUT_ALSO_OPERATION, t);
Tsurgeon.processPattern(NOT_ONLY_PATTERN, NOT_ONLY_OPERATION, t);
Tsurgeon.processPattern(AT_RBS_PATTERN, AT_RBS_OPERATION, t);
Tsurgeon.processPattern(AT_ALL_PATTERN, AT_ALL_OPERATION, t);

Expand Down
4 changes: 3 additions & 1 deletion src/edu/stanford/nlp/trees/EnglishPatterns.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,11 @@ public class EnglishPatterns {

/** A list of verbs which are verbs of speaking that easily take an S (as a complement or topicalized)
* which is a direct speech ccomp. For example: "He concedes: ``This is a difficult market.''"
* <br>
* TODO: maybe sign, as in ASL? sing ... wish?
*/
public static final String sayVerbRegex =
"/^(?i:say|says|said|saying|(?:add|boast|counsel|explain|inform|interject|recall|remark|respond|proclaim|report|claim|shout|whisper|yell)(?:s|ed|ing)?|(?:advis|announc|acknowledg|conced|conclud|decid|declar|observ|stat|not|inton)(?:e|es|ed|ing)|(?:confess)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|reply|replied|replies|replying|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|think|thinks|thinking|thought)$/";
"/^(?i:say|says|said|saying|(?:add|bellow|bleat|blubber|bluster|boast|boom|bray|call|chant|chirp|claim|complain|coo|counsel|croak|crow|drawl|explain|gasp|inform|interject|pray|proclaim|protest|purr|recall|remark|report|respond|scream|shout|shriek|sigh|sulk|whisper|whoop|yammer|yap|yell|yelp)(?:s|ed|ing)?|(?:advis|announc|acknowledg|cackl|chortl|chuckl|conced|conclud|decid|declar|dron|grip|grous|inton|not|observ|pledg|propos|stat|whin|whing)(?:e|es|ed|ing)|(?:bitch|confess|kibitz|kibbitz|screech)(?:es|ed|ing)?|(?:agree)(?:s|d|ing)?|(?:cr|repl)(?:y|ied|ies|ying)|admit|admits|admitted|admitting|hold|holds|holding|held|write|writes|writing|wrote|tell|tells|telling|told|quipped|quip|quips|quipping|signal|signals|signaled|signalled|signaling|signalling|think|thinks|thinking|thought)$/";


// TODO: is there some better pattern to look for? We do not have tag information at this point
Expand Down
25 changes: 25 additions & 0 deletions src/edu/stanford/nlp/trees/QPTreeTransformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,29 @@ public Tree transformTree(Tree t) {
private static final TsurgeonPattern splitMoneyTsurgeon =
Tsurgeon.parseOperation("createSubtree QP left right");

// This fixes a very rare subset of parses
// such as "(NP (QP just about all) the losses) ..."
// in fact, that's the only example in ptb3-revised
// because of previous MWE combinations, we may already get
// "(NP (QP at least a) day)"
// -> "(NP (QP (ADVP at least) a) day)"
// and therefore the flattenAdvmodTsurgeon will also find that parse
private static final TregexPattern groupADVPTregex =
TregexPattern.compile("NP < (QP <1 RB=first <2 RB=second <3 (DT !$+ __) $++ /^N/)");

private static final TsurgeonPattern groupADVPTsurgeon =
Tsurgeon.parseOperation("createSubtree ADVP first second");

// Remove QP in a structure such as
// (NP (QP nearly_RB all_DT) stuff_NN)
// so that the converter can attach both `nearly` and `all` to `stuff`
// not using a nummod, either, which is kind of annoying
private static final TregexPattern flattenAdvmodTregex =
TregexPattern.compile("NP < (QP=remove <1 ADVP|RB <2 (DT !$+ __) $++ /^N/)");

private static final TsurgeonPattern flattenAdvmodTsurgeon =
Tsurgeon.parseOperation("excise remove remove");

/**
* Transforms t if it contains one of the following QP structure:
* <ul>
Expand All @@ -121,6 +144,8 @@ public Tree QPtransform(Tree t) {
}
t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
t = Tsurgeon.processPattern(groupADVPTregex, groupADVPTsurgeon, t);
t = Tsurgeon.processPattern(flattenAdvmodTregex, flattenAdvmodTsurgeon, t);
return t;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -881,11 +881,6 @@ private UniversalEnglishGrammaticalRelations() {}
* the meaning of the NP. Also, the enumeration of lists have
* this relation to the head of the list item.
* <br>
* Also, the enumeration of lists have this relation to the head of
* the list item. For that, we allow the list of constituents which
* have a list under them in any of the training data, as the parser
* will likely not produce anything else anyway.
* <br>
* PTB: PP NP X S FRAG <br>
* EWT: SQ SBARQ SINV SBAR NML VP <br>
* Craft: PRN <br>
Expand All @@ -905,9 +900,7 @@ private UniversalEnglishGrammaticalRelations() {}
// Note that the earlier tregexes are usually enough to cover those phrases, such as when
// the QP is by itself in an ADJP or NP, but sometimes it can have other siblings such
// as in the phrase "$ 100 million or more". In that case, this next expression is needed.
"QP < QP=target < /^[$]$/",
// Lists are treated as nummod in UD_English-EWT
"PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target");
"QP < QP=target < /^[$]$/");


/**
Expand Down Expand Up @@ -1019,12 +1012,19 @@ private UniversalEnglishGrammaticalRelations() {}
* define this to include: interjections (oh, uh-huh, Welcome), fillers (um, ah),
* and discourse markers (well, like, actually, but not: you know).
* We also use it for emoticons.
* <br>
* Also, the enumeration of lists have this relation to the head of
* the list item. For that, we allow the list of constituents which
* have a list under them in any of the training data, as the parser
* will likely not produce anything else anyway.
*/
public static final GrammaticalRelation DISCOURSE_ELEMENT =
new GrammaticalRelation(Language.UniversalEnglish, "discourse", "discourse element",
MODIFIER, ".*", tregexCompiler,
"__ < (NFP=target [ < " + WESTERN_SMILEY + " | < " + ASIAN_SMILEY + " ] )",
"__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]");
"__ [ < INTJ=target | < (PRN=target <1 /^(?:,|-LRB-)$/ <2 INTJ [ !<3 __ | <3 /^(?:,|-RRB-)$/ ] ) ]",
// Lists are treated as discourse in UD_English-EWT as of 2.14
"PP|NP|X|S|FRAG|SQ|SBARQ|SINV|SBAR|NML|VP|PRN|ADJP < LST=target");


/**
Expand Down Expand Up @@ -1312,7 +1312,6 @@ private UniversalEnglishGrammaticalRelations() {}
MODIFIER,
"S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR|NP(?:-TMP|-ADV)?", tregexCompiler,
"NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ $++ CC)",
"NP|NP-TMP|NP-ADV|NX|NML < (CONJP=target < (RB < /^(?i:not)$/) < (RB|JJ < /^(?i:only|merely|just)$/) $++ CC|CONJP)",
// This matches weird/wrong NP-internal preconjuncts where you get (NP PDT (NP NP CC NP)) or similar
"NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ ) < (NP < CC)",
"/^S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR$/ < (PDT|DT|CC=target < /^(?i:either|neither|both)$/ $++ CC)",
Expand Down
5 changes: 4 additions & 1 deletion src/edu/stanford/nlp/trees/UniversalPOSMapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ public static void load() {
// RB -> PART when it is verbal negation (not or its reductions)
{ "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" },

// "not" as part of a phrase such as "not only", "not just", etc is tagged as PART in UD
{ "@ADVP|CONJP <1 (RB=target < /^(?i:not|n't|nt|t|n)$/) <2 (__ < only|just|merely|even) !<3 __", "PART" },

// Otherwise RB -> ADV
{ "RB=target <... {/.*/}", "ADV" },

Expand Down Expand Up @@ -165,7 +168,7 @@ public static void load() {
{"EX", "PRON"},
{"FW", "X"},
{"/^JJ.*$/", "ADJ"},
{"LS", "X"},
{"LS", "NUM"},
{"MD", "AUX"},
{"NNS", "NOUN"},
{"NNP", "PROPN"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,16 @@ private static BufferedReader getBufferedReader(String source) {
"adjoin (NP NN@) newnp\n" +
'\n') +

// Fix not_RB only_JJ, which should generally be not_RB only_RB
// and put it under a CONJP instead of an ADVP
("ADVP|CONJP <1 (__ < /^(?i:not)$/) <2 (JJ=bad < only|just|merely|even) !<3 __\n" +
"relabel bad RB\n" +
'\n') +

("ADVP=bad <1 (__ < /^(?i:not)$/) <2 (RB < only|just|merely|even) !<3 __\n" +
"relabel bad CONJP\n" +
'\n') +

// Fix some cases of 'as well as' not made into a CONJP unit
// There are a few other weird cases that should also be reviewed with the tregex
// well|Well|WELL , as|AS|As . as|AS|As !>(__ > @CONJP)
Expand Down Expand Up @@ -1166,6 +1176,15 @@ private static BufferedReader getBufferedReader(String source) {

"") +

// for structures such as "over a year", "about a decade", etc
("NP < (QP <1 IN=bad <2 (DT !$+ __) $+ /^N/)\n" +
"relabel bad RB\n" +
"\n") +

// for structures such as "just over a decade"
("NP < (QP <1 (RB < just) <2 IN=bad <3 (DT !$+ __) $+ /^N/)\n" +
"relabel bad RB\n" +
"\n") +

("@QP < (IN|JJ|RBR|RP=bad < about)\n" +
"relabel bad RB\n" +
Expand Down Expand Up @@ -1221,6 +1240,10 @@ private static BufferedReader getBufferedReader(String source) {
"relabel bad VBN\n" +
'\n') +

// First, second, third are treated as LS in PTB
// but in UD EWT, GUM, etc they are treated as RB
("@ADVP <: LS=bad\n" +
"relabel bad RB\n\n") +

("@SBAR < (DT|WDT|NN|NNP|RB=bad < that|because|while|Though|Whether)\n" +
"relabel bad IN\n" +
Expand Down
Loading
Loading