diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java index 542e2219dc..ae9efbd802 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Set; +import edu.stanford.nlp.util.Quadruple; import edu.stanford.nlp.util.Triple; /** @@ -31,8 +32,8 @@ public class NodeAttributes { // for individual elements of that map rather than turn the map into a string // and search on its contents that way. This is especially true since there // is no guarantee the map will be in a consistent order. - // String, String, String: node attribute for a map (such as CoNLLUFeats), key in that map, value to match - private List> contains; + // String, String, String, Boolean: node attribute for a map (such as CoNLLUFeats), key in that map, value to match, negated? + private List> contains; public NodeAttributes() { root = false; @@ -68,15 +69,15 @@ public void setAttribute(String key, String value, boolean negated) { attributes.add(new Triple(key, value, negated)); } - public void addContains(String annotation, String key, String value) { - contains.add(new Triple(annotation, key, value)); + public void addContains(String annotation, String key, String value, Boolean negated) { + contains.add(new Quadruple(annotation, key, value, negated)); } public List> attributes() { return Collections.unmodifiableList(attributes); } - public List> contains() { + public List> contains() { return Collections.unmodifiableList(contains); } } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java index 8a841fc9a8..f7d02b96bc 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java @@ -13,6 +13,7 @@ import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Quadruple; import edu.stanford.nlp.util.Triple; import edu.stanford.nlp.util.logging.Redwood; @@ -91,10 +92,11 @@ public NodePattern(GraphRelation r, boolean negDesc, } } - for (Triple entry : attrs.contains()) { + for (Quadruple entry : attrs.contains()) { String annotation = entry.first(); String key = entry.second(); String value = entry.third(); + boolean negated = entry.fourth(); Class clazz = AnnotationLookup.getValueType(AnnotationLookup.toCoreKey(annotation)); boolean isMap = clazz != null && Map.class.isAssignableFrom(clazz); @@ -105,11 +107,11 @@ public NodePattern(GraphRelation r, boolean negDesc, final Attribute attr; // Add the attributes for this key if (value.equals("__")) { - attr = new Attribute(key, true, true, false); + attr = new Attribute(key, true, true, negated); } else if (value.matches("/.*/")) { - attr = buildRegexAttribute(key, value, false); + attr = buildRegexAttribute(key, value, negated); } else { // raw description - attr = new Attribute(key, value, value, false); + attr = new Attribute(key, value, value, negated); } partialAttributes.add(new Pair<>(annotation, attr)); @@ -239,17 +241,19 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i Class clazz = Env.lookupAnnotationKey(env, annotation); Object rawmap = node.get(clazz); - // if the map is null, it can't possibly match... + final String nodeValue; if (rawmap == null) { - return negDesc; + nodeValue = null; + } else { + if (!(rawmap instanceof Map)) + throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!"); + Map map = (Map) rawmap; + + // TODO: allow for regex match on the keys? + Object value = map.get(attr.key); + nodeValue = (value == null) ? null : value.toString(); } - if (!(rawmap instanceof Map)) - throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!"); - Map map = (Map) rawmap; - // TODO: allow for regex match on the keys? - Object value = map.get(attr.key); - final String nodeValue = (value == null) ? null : value.toString(); boolean matches = checkMatch(attr, ignoreCase, nodeValue); if (!matches) { return negDesc; diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java index cd5e4b98ac..0b7e5e8741 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java @@ -65,7 +65,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 11: case 15: case 17: - case 23:{ + case 24:{ node = SubNode(GraphRelation.ROOT); children.add(node); label_1: @@ -135,7 +135,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } case 15: case 17: - case 23:{ + case 24:{ result = ModNode(r); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: @@ -397,7 +397,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 15: case 17: - case 23:{ + case 24:{ node = ModNode(reln); break; } @@ -454,7 +454,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 14: case 15: case 17: - case 23:{ + case 24:{ ; break; } @@ -485,7 +485,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean startUnderNeg; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 17: - case 23:{ + case 24:{ child = Child(r); break; } @@ -512,7 +512,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th child = NodeDisj(r); break; } - case 23:{ + case 24:{ child = Description(r); break; } @@ -569,8 +569,22 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } break; } - case ALIGNRELN:{ - attrType = jj_consume_token(ALIGNRELN); + case ALIGNRELN: + case 23:{ + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case ALIGNRELN:{ + attrType = jj_consume_token(ALIGNRELN); + break; + } + case 23:{ + attrType = jj_consume_token(23); + break; + } + default: + jj_la1[25] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } key = jj_consume_token(IDENTIFIER); jj_consume_token(21); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { @@ -583,7 +597,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[25] = jj_gen; + jj_la1[26] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -591,11 +605,12 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th {if (true) throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + " key=" + key + " value=" + value);} } - attributes.addContains(attr.image, key.image, value.image); + boolean negated = attrType.image.equals("!@"); + attributes.addContains(attr.image, key.image, value.image, negated); break; } default: - jj_la1[26] = jj_gen; + jj_la1[27] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -612,7 +627,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[27] = jj_gen; + jj_la1[28] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -622,7 +637,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean link = false; NodeAttributes attributes = new NodeAttributes(); NodePattern pat; - jj_consume_token(23); + jj_consume_token(24); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER: case EMPTY: @@ -631,24 +646,24 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th label_6: while (true) { switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case 24:{ + case 25:{ ; break; } default: - jj_la1[28] = jj_gen; + jj_la1[29] = jj_gen; break label_6; } - jj_consume_token(24); + jj_consume_token(25); AddAttribute(attributes); } break; } default: - jj_la1[29] = jj_gen; + jj_la1[30] = jj_gen; ; } - jj_consume_token(25); + jj_consume_token(26); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 21:{ jj_consume_token(21); @@ -665,7 +680,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[30] = jj_gen; + jj_la1[31] = jj_gen; ; } pat = new NodePattern(r, underNodeNegation, attributes, link, name != null ? name.image : null); @@ -682,13 +697,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th public Token jj_nt; private int jj_ntk; private int jj_gen; - final private int[] jj_la1 = new int[31]; + final private int[] jj_la1 = new int[32]; static private int[] jj_la1_0; static { jj_la1_init_0(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x400,0x828808,0x3801c,0x3801c,0x828800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x828800,0x2000,0x82c000,0x4000,0x828000,0x820000,0x400400,0x110,0x110,0x400408,0xd0,0x1000000,0xd0,0x200000,}; + jj_la1_0 = new int[] {0x400,0x1028808,0x3801c,0x3801c,0x1028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x1028800,0x2000,0x102c000,0x4000,0x1028000,0x1020000,0x400400,0x110,0x800008,0x110,0xc00408,0xd0,0x2000000,0xd0,0x200000,}; } /** Constructor with InputStream. */ @@ -702,7 +717,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -716,7 +731,7 @@ public void ReInit(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Constructor. */ @@ -726,7 +741,7 @@ public SemgrexParser(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -744,7 +759,7 @@ public void ReInit(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Constructor with generated Token Manager. */ @@ -753,7 +768,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -762,7 +777,7 @@ public void ReInit(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } private Token jj_consume_token(int kind) throws ParseException { @@ -813,12 +828,12 @@ private int jj_ntk_f() { /** Generate ParseException. */ public ParseException generateParseException() { jj_expentries.clear(); - boolean[] la1tokens = new boolean[26]; + boolean[] la1tokens = new boolean[27]; if (jj_kind >= 0) { la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 31; i++) { + for (int i = 0; i < 32; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1<) "=" (value = | value = ) + (attrType = "@" | attrType = "!@") (key = ) "=" (value = | value = ) { if (attr == null || key == null || value == null) { throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + " key=" + key + " value=" + value); } - attributes.addContains(attr.image, key.image, value.image); + boolean negated = attrType.image.equals("!@"); + attributes.addContains(attr.image, key.image, value.image, negated); }) ) | diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java index 7a55891f0c..cad0f272ea 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java @@ -55,6 +55,7 @@ interface SemgrexParserConstants { "\"~\"", "\"=\"", "\"!:\"", + "\"!@\"", "\"{\"", "\";\"", "\"}\"", diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java index e3fe4d9933..4fe38b9910 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java @@ -47,7 +47,7 @@ private int jjMoveStringLiteralDfa0_0(){ return jjStopAtPos(0, 9); case 33: jjmatchedKind = 15; - return jjMoveStringLiteralDfa1_0(0x400000L); + return jjMoveStringLiteralDfa1_0(0xc00000L); case 35: return jjStopAtPos(0, 6); case 36: @@ -63,7 +63,7 @@ private int jjMoveStringLiteralDfa0_0(){ case 58: return jjStopAtPos(0, 10); case 59: - return jjStopAtPos(0, 24); + return jjStopAtPos(0, 25); case 61: return jjStartNfaWithStates_0(0, 21, 2); case 63: @@ -75,11 +75,11 @@ private int jjMoveStringLiteralDfa0_0(){ case 93: return jjStopAtPos(0, 18); case 123: - return jjStopAtPos(0, 23); + return jjStopAtPos(0, 24); case 124: return jjStopAtPos(0, 13); case 125: - return jjStopAtPos(0, 25); + return jjStopAtPos(0, 26); case 126: return jjStopAtPos(0, 20); default : @@ -98,6 +98,10 @@ private int jjMoveStringLiteralDfa1_0(long active0){ if ((active0 & 0x400000L) != 0L) return jjStopAtPos(1, 22); break; + case 64: + if ((active0 & 0x800000L) != 0L) + return jjStopAtPos(1, 23); + break; default : break; } @@ -358,7 +362,7 @@ else if (curChar < 128) public static final String[] jjstrLiteralImages = { "", null, null, "\100", null, null, "\43", "\44", null, "\12", "\72", "\50", "\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\176", "\75", "\41\72", -"\173", "\73", "\175", }; +"\41\100", "\173", "\73", "\175", }; protected Token jjFillToken() { final Token t; @@ -595,10 +599,10 @@ public void SwitchTo(int lexState) /** Lex State array. */ public static final int[] jjnewLexState = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, + -1, -1, }; static final long[] jjtoToken = { - 0x3fffffdL, + 0x7fffffdL, }; static final long[] jjtoSkip = { 0x2L, diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java index 6c4b7e7502..b74d2e3b58 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java @@ -249,6 +249,29 @@ public void testContainsExpression() { runTest(pattern, graph, "D", "F"); } + public void testContainsRegexExpression() { + // morphofeatures is a Map, so this should work + SemanticGraph graph = makeComplicatedGraph(); + Set vertices = graph.vertexSet(); + for (IndexedWord iw : vertices) { + if (iw.value().equals("B") || iw.value().equals("D") || iw.value().equals("F")) { + CoNLLUFeatures feats = new CoNLLUFeatures(); + feats.put("foo", "bar" + iw.value()); + iw.set(CoreAnnotations.CoNLLUFeats.class, feats); + } + } + + // test a positive regex + SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures@foo=/bar[BD]/}"); + runTest(pattern, graph, "B", "D"); + + // test a negative regex + // should match both the ones that don't have features + // and the ones that have a non-matching feature + pattern = SemgrexPattern.compile("{morphofeatures!@foo=/bar[BD]/}"); + runTest(pattern, graph, "A", "C", "E", "F", "G", "H", "I", "J"); + } + public void testReferencedRegex() { runTest("{word:/Bill/}", "[ate subj>Bill obj>[bill det>the]]", "Bill");