diff --git a/documentation/Cases_review.md b/documentation/Cases_review.md new file mode 100644 index 0000000..ad86592 --- /dev/null +++ b/documentation/Cases_review.md @@ -0,0 +1,81 @@ + +Amélioration de nsubj/dobj avec instance_of +=========================================== + +#### instance_of + nsubj(pass) + +* nsubjpass + prep_in : What language is spoken in Argentina? +* nsubj + dobj : What actor married John F. Kennedy's sister? +* nsubj + prep_by : List movies directed by Spielberg +* nsubjpass : Which president has been killed by Oswald? +* nsubjpass : which book was authored by Victor Hugo + +#### instance_of + dobj + +* Which books did Suzanne Collins write? +* How many films did Ingmar Bergman make? +* How many children does Barack Obama have? +* How many gas stations are there in the United States? + +#### nsubj avec verbe nécessaire + +* What is the most beautiful country in Europe? +* Who was the first Taiwanese President? +* What was the monetary value of the Nobel Peace Prize in 1989? +* When was Benjamin Disraeli prime minister? +* nsubjpass : Where was Ulysses S. Grant born? +* nsubjpass : Where is Inoco based? +* What was the first Gilbert and Sullivan opera? +* Where is the ENS of Lyon? +* What did Bob write ? +* Who is the author of Sea and Sky? +* Is there a ghost in my house +* Are there computers in your room + +#### Question word nsubj + +No subject after preprocessing + +* Who wrote the song, "Stardust"? +* Who invented the hula hoop? +* Who elected the president ? +* Who was killed by Oswald? + +#### Autres + +* tmod : which day was the president born +* prep_of : Of which country is Paris the capital? > mal parsé +* prep_in : In which countries is the Lake Victoria? https://www.google.fr/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=%22in+which+countries%22 +* prep_from : From which country is Alan Turing? 
+ +_________________________________________________________________________________________________________________________________ + + +Exists +====== + +* Is there a ghost in my house +* Is there a pilot in the plane +* Is there a capital in France +* Is there a king of england > https://www.wikidata.org/wiki/Q18810062 +* http://english.stackexchange.com/questions/34353/is-there-versus-are-there +* Are there any articles available on the subject? +* Are there computers in your room +* Does a king of England exist? + +_________________________________________________________________________________________________________________________________ + +Semi question words +=================== + +* Show me Star Wars movies > mal parsé +* List movies directed by Spielberg +* List books by Roald Dahl +* List albums of Pink Floyd +* List films with Jack Nicholson +* List of US presidents +* List of presidents of France +* Give me the capital of France +* Give the capital of France +* Give us the capital of France +* list of president of usa > mal parsé diff --git a/documentation/General_questions.md b/documentation/General_questions.md index 8f8930b..a2c8eaf 100644 --- a/documentation/General_questions.md +++ b/documentation/General_questions.md @@ -1,14 +1,12 @@ General ======= -* Yes/no question: product dobj relation? * verb+ing: do sthg special (look POS tag)? : What did Richard Feynman say upon hearing he would receive the Nobel Prize in Physics? * If nounification becomes powerful enough: use it to analyse superlative (biggest > size...) * Multiple words : Where is Inoco based? > base + place = base place :( >> en fait "base" se nounifie en "place" ? * Article : enlever numéro apparaissant dans noeud (et idem pour arbre) / enlever encadrement * data model : autoriser des listes de prédicats dans les sort ? * réecrire demo3 pour le rendre dépendant de DependencyTree -* t5 peut être enlevé ? 
* Dans le question word processing (et plus généralement) : les connecteurs ne sont pas uniquement les 1000. Les 1000 prennent les conj mais pas le superlatives. * Travailler sur la forme normalisée? >> garantir qu'en entrée de normalize chaque noeud contient une seule alternative @@ -20,6 +18,19 @@ General * Multiple predicates pour les sort * Tell me where the DuPont company is located. Name the Ranger who was always after Yogi Bear. * How do you solve "Rubik's Cube"? > en quoi est transformé how +* réduire le nb de map, ajouter + d'infos +* autres auxiliaire (have) : What dictator has the nickname "El Maximo" +* propagation de types : nsubjRule + qw in strongQuestionWord = R5s +* Who was the leader of the Branch Davidian Cult confronted by the FBI in Waco, Texas in 1993? >> gros sujet +* Where is Inoco based? >> revoir la nounification associée +* Who Clinton defeated? >> prq nounification échoue ? non lemmatizé ? +* Rapprocher/renommer les règles R.. similaires +* __How many__ : opérateur de comptage + > How many films did Ingmar Bergman make? + > How many children does Barack Obama have? + > How many gas stations are there in the United States? + > cf instance_of sur dobj >> on récupère la liste produite en sortie et on renvoie sa taille + > How much did Mercury spend on advertising in 1993? Remarks ======= @@ -33,7 +44,7 @@ Gestion des prep(c)_x ===================== go -prep_to-> ... = go to -prep-> -What is Frozen based on? +* What is Frozen based on? * What two US biochemists won the Nobel Prize in medicine in 1992? @@ -49,55 +60,178 @@ Superlative Yes/No questions ================ -Exists -====== +* Yes/no question: product dobj relation? +* nsubj + prep_from : Are you from Germany? 
> (you,origin,Germany) > yes/no : (subj | pred:be from, do..live | cpt) -* Is there a ghost in my house -* Is there a pilot in the plane -* Is there a capital in France -* Is there a king of england > https://www.wikidata.org/wiki/Q18810062 -* http://english.stackexchange.com/questions/34353/is-there-versus-are-there -* Are there any articles available on the subject? -* Are there computers in your room -* Does a king of England exist? +Conjonction +=========== -Semi question words +Mauvais : + * What was the first Gilbert and Sullivan opera? + +Exemples : +---------- +* Who makes and distributes bells? +* Who is the author of Sea and Sky? +* What percentage of the world's plant and animal species can be found in the Amazon forests? +* Good: Who is section manager for guidance and control systems at JPL? +* Bad: How many people did the United Nations commit to help restore order and distribute humanitarian relief in Somalia in September 1992? +* Bad: Which Italian city is home to the Cathedral of Santa Maria del Fiore or the Duomo? + +Problem with merging: +--------------------- +* What is the length of border between the Ukraine and Russia? + +Comment construire les sous arbres +---------------------------------- +* What was the first Gilbert and Sullivan opera? +* When was General Manuel Noriega ousted as the leader of Panama and turned over to U.S. authorities? +* When did Princess Diana and Prince Charles get married? +* When did the royal wedding of Prince Andrew and Fergie take place? +* ++ How many people did the United Nations commit to help restore order and distribute humanitarian relief in Somalia in September 1992? + >> peut être propager les prep après ? + >> même problème que pour les nn + +Merge nn with the 2 nodes if nn above them: + - When did Princess Diana and Charles get married? + - When did Princess Diana and Prince Charles get married? + - Who is section manager for guidance and control systems at JPL? 
+ +_________________________________________________________________________________________________________________________________ +_________________________________________________________________________________________________________________________________ + +Améliorer la MWE recognition +============================ + +Rattraper un mauvais parsing: + * who is the president of the United states of america + * Where is the ENS of Lyon? (merge car majuscule?) + +Good: + * Who is the United States president + * What was the first Gilbert and Sullivan opera? + * Obama is the United States president. + +Amod: + * Who is the French president? >> nécessite avant de transformer French en France + * Who was the first Taiwanese President? + +What organization was founded by the Rev. Jerry Falwell? >> tagger Rev car majuscule + +_________________________________________________________________________________________________________________________________ + +Traitement des prep =================== -* Show me Star Wars movies +Passer prep en Rnew + * verbe auxiliaire : + - + * verbe non auxiliaire : + - List movies directed by Spielberg + - What language is spoken in Argentina? :( + - What kings ruled on France? + - Who was born on 1984? + * nom : + - List of books by Roald Dahl + - president of France + +_________________________________________________________________________________________________________________________________ + +### nsubj + +R5 +== + +* Where does the president live? + +R3 +== + +R5 ou R3 +======== + +* What did George Orwell write? +* Which books did Suzanne Collins write? + +### nsubpass + +R5 +== + +R3 +== + +R5 ou R3 +======== + +* Where was Ulysses S. Grant born? +* Where is Inoco based? + +### agent + +R5 +== + +R3 +== + +R5 ou R3 +======== + +* Who was killed by Oswald? +* Which president has been killed by Oswald? +* Which books were authored by Victor Hugo? 
+ +---------------- + +### dobj + +R5 +== + +R3 +== + +R5 ou R3 +======== + +* Who developed Microsoft? +* What actor married John F. Kennedy's sister? +* Who has written "The Hitchhiker's Guide to the Galaxy"? +* Who wrote the song, "Stardust"? +* Who invented the hula hoop? +* Who elected the president ? +* Who killed Gandhi? + +### prep (+ V) + +R5 +== + +R3 +== + +* Which kings ruled on France * List movies directed by Spielberg -* List books by Roald Dahl -* List albums of Pink Floyd -* List films with Jack Nicholson -* List of US presidents -* List of presidents of France -* Give me the capital of France -* Give the capital of France -* Give us the capital of France - -Racine à fils multiples -======================= -* nsubj + prep_from : Are you from Germany? > (you,origin,Germany) > yes/no : (subj | pred:be from, do..live | cpt) -* nsubj + prep_by : List movies directed by Spielberg -* prep_of + prep_of : list of president of usa -* nsubj + prep_by : List movies directed by Spielberg +R5 ou R3 +======== + +* What language is spoken in Argentina? +* Who followed Willy Brandt as chancellor of the Federal Republic of Germany? +* Who was born on 1984 + +---------------- +The animal | lives in | the farm. + Subject Predicate Object >> ( animal , residence , farm ) -instance of: +The animal | lives in | the farm. + Object Predicate Subject >> ( farm , inhabitant , animal ) -* prep_from + prep_to + prep_on : carpool from Lyon to Paris on December 31 > (?, instance of, carpool) ∩ (?, from, Paris) ∩ (?, to, Lyon) ∩ (?,day, December 31st) -* nsubjpass + prep_in : What language is spoken in Argentina? > (Argentina, language, ?) -* nsubj + dobj : Which books did Suzanne Collins write? > (Suzanne Collins, author, ?) + typage "book" sur ? -* nsubj + dobj (+ do) : What albums did Pearl Jam record? -* nsubj + dobj : What dictator has the nickname "El Maximo"? -* nsubj + dobj : What actor married John F. Kennedy's sister? > (?, instance of, actor) ∩ (?, wife, (John F. 
Kennedy, sister, ?)) -* nsubj + prep_in : How many gas stations are there in the United States? +--------------- -* voir Problematic questions dans hierarchy review +processQuestionInfo dans questionWordProcessing doit être le seul habilité à affaiblir une règle en R2 (ou R3 bis) (mais pas en R0: where is the residence) +dependency analysis pose un R5/R3 puis processQuestionInfo affaiblit les dépendances de plus haut niveau s'il trouve l'info en-dessous -Amélioration des question maps -============================== -* How much : ajouter cost -* Plus généralement : réduire le nb de map, ajouter + d'infos diff --git a/documentation/Hierarchy_review.md b/documentation/Hierarchy_review.md index 2ebc859..a2a8b63 100644 --- a/documentation/Hierarchy_review.md +++ b/documentation/Hierarchy_review.md @@ -10,8 +10,6 @@ Please, don't remove any example (they are used frequently to check the whole al * appos * dep (no hope... the stanford parser needs to be improved/trained) -* dobj -* nsubj ### Problematic questions @@ -20,22 +18,16 @@ Please, don't remove any example (they are used frequently to check the whole al * How far is Yaroslavl from Moscow? * What effect does a prism have on light? * Where was the movie "Somewhere in Time" filmed? (not always the same result?) -* What city is Purdue University in? -* Who is the author of the book, "The Iron Lady : A Biography of Margaret Thatcher"? >> problem of redundancy (book + title of the book) -* What U.S. state is Fort Knox in? * What country in Latin America is the largest one? * He is the biggest and fattest man >> problem with amod and 2*conj * Who held the endurance record for women pilots in 1929? >> problem with for * How many people that live in China speak english? * How many USA presidents have visited Iran? -* Which movies does Quentin Tarantino star in? * Which movies did Quentin Tarantino direct, but not star in? * Who receives the Nobel Prize in Physics in 2000? * When did Diana and Charles get married?
* Where is Mozambique located? > location/place * Who built the first pyramid? > consider "pyramide" as (single) triple / predicate -* Who wrote the book, "Huckleberry Finn"? -* What kind of animal is Babar? * Who was the first Taiwanese President? * What is the brightest star visible from Earth? @@ -48,11 +40,23 @@ Current rule: don't merge/remove appos * Who came up with the name, El Nino? * Who wrote the song, "Stardust"? > (sometimes dep instead of appos) replace the father by the son || or R5 (or R2) rule? +* Who wrote the book, "Huckleberry Finn"? +* Who is the author of the book, "The Iron Lady : A Biography of Margaret Thatcher"? >> problem of redundancy (book + title of the book) + +prt +=== + +* Who came up with the name, El Nino? xcomp ===== * What did John Hinckley do to impress Jodie Foster? +* Obama is the United States president. + +##### +++ + +* Who developed Skype amod ==== @@ -91,6 +95,7 @@ Current rule: merge * What two US biochemists won the Nobel Prize in medicine in 1992? * Who is the US president? * When was Benjamin Disraeli prime minister? +* What dictator has the nickname "El Maximo"? ##### +++ @@ -103,7 +108,7 @@ nsubjpass ##### --- -* Which president has been killed by Oswald? > remove nsubjpass +* Which president has been killed by Oswald? ##### +++ @@ -111,6 +116,13 @@ nsubjpass * Where is Inoco based? * Where was George Washington born? +agent +===== + +* Which president has been killed by Oswald? +* Who was killed by Oswald? +* which book was authored by Victor Hugo + cop === @@ -121,6 +133,8 @@ cop doesn't always disappear -> needs to remove it manually * What is the brightest star visible from Earth? >> cop not removed! change what <-> is * Who is the president black and blue? * What is black and white? +* What is the UN headquarter? +* What is the United States national day? prep ==== @@ -145,20 +159,19 @@ dobj * What did John Hinckley do to impress Jodie Foster? * When did they won the lottery? 
* What two US biochemists won the Nobel Prize in medicine in 1992? -* How many films did Ingmar Bergman make? >> with nsubj +* How many films did Ingmar Bergman make? * Who has written "The Hitchhiker's Guide to the Galaxy"? * Who wrote "The Hitchhiker's Guide to the Galaxy"? * Who invented the hula hoop? * Who killed Gandhi? -* Who elected the president ? +* Who elected the president? look at the verb ? (passive, acted ...) ##### --- -* Who held the endurance record for women pilots in 1929? -* How many children does Barack Obama have? > not do an intersection each time a node have several children. -* Which books did Suzanne Collins write? (?) +* How many children does Barack Obama have? +* Which books did Suzanne Collins write? ccomp ===== @@ -200,23 +213,37 @@ vmod nsubj ===== -- only if a = is/was/... ? -- when is/... is replaced, it's more relevant to produce a R5 rule -- sometimes it's relevent to produce an "instance of" triple. -- not "instance of" only if is/was/... ? -- new rules Rnsubj ???? +* Who is Obama +* Which books did Suzanne Collins write? +* How many films did Ingmar Bergman make? +* Who Clinton defeated? +* Where does the prime minister of United Kingdom live? + +nsubj (Rnew): + +verbe auxiliaire : + - Who is Obama +verbe non auxiliaire : (actuellement perdu si pas strong qw) + - Which books did Suzanne Collins write? + - How many films did Ingmar Bergman make? + - Who Clinton defeated? + - What did Bob write ? +nom : + - ne devrait pas arriver ##### +++ * Who elected the president of France? -* What was the first Gilbert and Sullivan opera? >> problem or parsing failure ??? +* What was the first Gilbert and Sullivan opera? * Who Clinton defeated? -* Where is the ENS of Lyon? >> problem or parsing failure ??? +* Where is the ENS of Lyon? * What did Bob write ? > R3 if weak question word + not 'identity' >> problem or parsing failure ??? +* What actor married John F. Kennedy's sister ##### --- -* What actor married John F.
Kennedy's sister +* What does "Janelle" mean? + num === @@ -239,37 +266,6 @@ conj * When did Rococo painting and architecture flourish? -conj_and -======== - -Exemples : ----------- -* Who makes and distributes bells? -* Who is the author of Sea and Sky? -* What percentage of the world's plant and animal species can be found in the Amazon forests? -* Good: Who is section manager for guidance and control systems at JPL? -* Bad: How many people did the United Nations commit to help restore order and distribute humanitarian relief in Somalia in September 1992? -* Bad: Which Italian city is home to the Cathedral of Santa Maria del Fiore or the Duomo? - -Problem with merging: ---------------------- -* What is the length of border between the Ukraine and Russia? - -Comment construire les sous arbres ----------------------------------- -* What was the first Gilbert and Sullivan opera? -* When was General Manuel Noriega ousted as the leader of Panama and turned over to U.S. authorities? -* When did Princess Diana and Prince Charles get married? -* When did the royal wedding of Prince Andrew and Fergie take place? -* ++ How many people did the United Nations commit to help restore order and distribute humanitarian relief in Somalia in September 1992? - >> peut être propager les prep après ? - >> même problème que pour les nn - -Merge nn with the 2 nodes if nn above them: - - When did Princess Diana and Charles get married? - - When did Princess Diana and Prince Charles get married? - - Who is section manager for guidance and control systems at JPL? - pcomp ===== @@ -290,6 +286,9 @@ tmod ==== * Are there 29 days in February +* which day was the president born + +_________________________________________________________________________________________________________________________________ Stanford Parser fails ===================== @@ -306,3 +305,17 @@ Stanford Parser fails * Which movies did Quentin Tarantino direct, but not star in? * Are there beers in Germany? 
* Show me Star Wars movies +* What country is the biggest producer of tungsten? +* How long did the Charles Manson murder trial last? +* What kind of animal is Babar? +* Which movies does Quentin Tarantino star in? +* What U.S. state is Fort Knox in? +* What city is Purdue University in? +* When was Benjamin Disraeli prime minister? +* list of president of usa +* Show me Star Wars movies +* Who held the endurance record for women pilots in 1929? +* What dictator has the nickname "El Maximo"? +* Of which country is Paris the capital? +* List of books by Roald Dahl. +* What albums did Pearl Jam record? diff --git a/nounification/nounifyMe.md b/nounification/nounifyMe.md index e6ae9c2..65b376f 100644 --- a/nounification/nounifyMe.md +++ b/nounification/nounifyMe.md @@ -5,4 +5,5 @@ - build : constructor - lead : leader - play : player, musician, actor - +- make : realisator (How many films did Ingmar Bergman make?) +- Who Clinton defeated? diff --git a/ppp_questionparsing_grammatical/__init__.py b/ppp_questionparsing_grammatical/__init__.py index a35a280..5ff65e2 100644 --- a/ppp_questionparsing_grammatical/__init__.py +++ b/ppp_questionparsing_grammatical/__init__.py @@ -2,7 +2,7 @@ from ppp_libmodule import HttpRequestHandler from .preprocessingMerge import mergeNamedEntityTagChildParent, mergeNamedEntityTagSisterBrother, mergeNamedEntityTag -from .preprocessing import Word, DependenciesTree, computeTree, QuotationHandler +from .preprocessing import Word, DependenciesTree, computeTree, QuotationHandler, correctTree from .questionWordProcessing import identifyQuestionWord from .dependencyAnalysis import simplify from .normalization import normalize @@ -16,4 +16,4 @@ def app(environ, start_response): return HttpRequestHandler(environ, start_response, RequestHandler) \ .dispatch() -__all__ = ['DependenciesTree','computeTree','QuotationHandler','mergeNamedEntityTagChildParent','mergeNamedEntityTagSisterBrother','mergeNamedEntityTag','simplify', 
'identifyQuestionWord','normalize','QuotationError','NounificationError','GrammaticalError','Nounificator'] +__all__ = ['DependenciesTree','computeTree','QuotationHandler','correctTree','mergeNamedEntityTagChildParent','mergeNamedEntityTagSisterBrother','mergeNamedEntityTag','simplify', 'identifyQuestionWord','normalize','QuotationError','NounificationError','GrammaticalError','Nounificator'] diff --git a/ppp_questionparsing_grammatical/data/exceptions.py b/ppp_questionparsing_grammatical/data/exceptions.py index 04f1719..dd0f788 100644 --- a/ppp_questionparsing_grammatical/data/exceptions.py +++ b/ppp_questionparsing_grammatical/data/exceptions.py @@ -8,11 +8,6 @@ def __init__(self, expression, message): self.expression = expression self.message = message -class QuestionWordError(Exception): - def __init__(self, expression, message): - self.expression = expression - self.message = message - class NounificationError(Exception): def __init__(self, expression, message): self.expression = expression diff --git a/ppp_questionparsing_grammatical/data/nounificationManual.pickle b/ppp_questionparsing_grammatical/data/nounificationManual.pickle index 0cf1027..d2553cd 100644 Binary files a/ppp_questionparsing_grammatical/data/nounificationManual.pickle and b/ppp_questionparsing_grammatical/data/nounificationManual.pickle differ diff --git a/ppp_questionparsing_grammatical/data/questionWord.py b/ppp_questionparsing_grammatical/data/questionWord.py index a743446..ce89773 100644 --- a/ppp_questionparsing_grammatical/data/questionWord.py +++ b/ppp_questionparsing_grammatical/data/questionWord.py @@ -1,38 +1,41 @@ -""" - Taken from: http://www.interopia.com/education/all-question-words-in-english/ - Yes/no question -""" + +########################### +# Possible question words # +########################### + +# Open-ended questions + What... for, What... 
like, Why don't, Where from + Rarely used: Wherefore, Whatever, Wherewith, Whither, Whence, However +openQuestionWord = [ + 'list', 'what', 'what kind', 'what type', 'what sort', 'what time', 'when', 'why', 'where', 'who', 'how', 'how much', 'how many', 'how old', 'how far', 'how long', 'how tall', 'how deep', 'how wide', 'how fast', 'how often', 'how come', 'which', 'whom', 'whose', 'how big', 'of which', 'in which', 'from which' +] + +# Yes/no questions closeQuestionWord = [ 'is', 'are', 'am', 'was', 'were', 'will', 'do', 'does', 'did', 'have', 'had', 'has', 'can', 'could', 'should', 'shall', 'may', 'might', 'would' ] +# Exists questions existQuestionWord = [ 'is there', 'are there' ] +# Other questions semiQuestionWord = [ - 'show me', 'show them', 'show us', 'show him', 'show her', 'give me', 'give them', 'give us', 'give him', 'give her' + 'show me', 'show them', 'show us', 'show him', 'show her', 'give me', 'give them', 'give us', 'give him', 'give her', 'list of', 'give', 'show' ] -# semi questions words that cannot be removed from the tree -predicateQuestionWord = [ - 'list of', 'give', 'show' -] +######################### +# Other classifications # +######################### -""" - Open-ended questions - + What... for, What... like, Why don't, Where from - Rarely used: Wherefore, Whatever, Wherewith, Whither, Whence, However -""" -openQuestionWord = [ - 'list', 'what', 'what kind', 'what type', 'what sort', 'what time', 'when', 'why', 'where', 'who', 'how', 'how much', 'how many', 'how old', 'how far', 'how long', 'how tall', 'how deep', 'how wide', 'how fast', 'how often', 'how come', 'which', 'whom', 'whose', 'how big' -] - -# question word that implies to add an extra triple (in practice: rule R2 vs R5s) (ex: where is the capital of france : (france,capital,?) --> ((france,capital,?),location,?) +# question word that implies to add an extra triple (in practice: rule R2 vs R5) (ex: where is the capital of france : (france,capital,?) 
--> ((france,capital,?),location,?) strongQuestionWord = [ - 'what kind', 'what type', 'what sort', 'what time', 'when', 'why', 'where', 'how', 'how much', 'how many', 'how old', 'how far', 'how long', 'how tall', 'how deep', 'how wide', 'how fast', 'how often', 'how come', 'whose', 'how big' + 'what kind', 'what type', 'what sort', 'what time', 'when', 'why', 'where', 'how', 'how much', 'how many', 'how old', 'how far', 'how long', 'how tall', 'how deep', 'how wide', 'how fast', 'how often', 'how come', 'whose', 'how big', 'in which', 'from which' ] +####################### +# Question words maps # +####################### + questionExcept = { # words that already contain the info of the question word 'what type' : ['type','sort'], @@ -40,7 +43,7 @@ 'what time' : ['time','date','day','month','year'], 'when' : ['time','date','day','month','year'], 'why' : ['reason','cause','origin'], - 'where' : ['place','location','residence','site'], + 'where' : ['place','location','residence','site','country'], 'how' : ['manner','way'], 'how much' : ['amount','quantity','number'], 'how many' : ['amount','quantity','number'], @@ -54,7 +57,9 @@ 'how often' : ['frequency'], 'how come' : ['reason'], 'whose' : ['owner'], - 'how big' : ['size'] + 'how big' : ['size'], + 'in which' : ['place','location','residence','site','country'], + 'from which' : ['place','location','residence','site','citizenship','nationality','country of citizenship','country'], } questionAdd = { @@ -64,7 +69,7 @@ 'what time' : ['time','date'], 'when' : ['time','date'], 'why' : ['reason','cause','origin'], - 'where' : ['place','location','residence'], + 'where' : ['place','location','residence','country'], 'how' : ['manner'], 'how much' : ['amount','quantity','number'], 'how many' : ['amount','quantity','number'], @@ -79,7 +84,9 @@ 'how come' : ['reason'], #'which' : ['choice'], 'whose' : ['owner'], - 'how big' : ['size'] + 'how big' : ['size'], + 'in which' : ['place','location','residence','country'], + 'from 
which' : ['place','location','residence','origin','citizenship','nationality','country of citizenship','country'] } @@ -92,7 +99,7 @@ 'what time' : ['time'], 'when' : ['date'], 'why' : ['reason'], - 'where' : ['place','location','residence'], + 'where' : ['place','location','residence','country'], 'who' : ['identity'], 'how' : ['manner'], 'how much' : ['amount'], @@ -109,7 +116,9 @@ 'which' : ['choice'], # ? 'whom' : ['identity'], 'whose' : ['owner'], - 'how big' : ['size'] + 'how big' : ['size'], + 'in which' : ['place','location','residence','country'], + 'from which' : ['place','location','residence','origin','citizenship','nationality','country of citizenship','country'] } questionType = { @@ -130,5 +139,7 @@ 'how often' : 'NUMBER', 'whom' : 'PERSON', 'whose' : 'PERSON', - 'how big' : 'NUMBER' + 'how big' : 'NUMBER', + 'in which' : 'LOCATION', + 'from which' : 'LOCATION', } diff --git a/ppp_questionparsing_grammatical/dependencyAnalysis.py b/ppp_questionparsing_grammatical/dependencyAnalysis.py index 75701a6..e4ca11b 100644 --- a/ppp_questionparsing_grammatical/dependencyAnalysis.py +++ b/ppp_questionparsing_grammatical/dependencyAnalysis.py @@ -26,48 +26,55 @@ def amodRule(t,qw): merge(t.child[0],qw) t.dependency = 'connectorUp' return - if t.namedEntityTag != 'ORDINAL' and t.wordList[0][0].pos != 'JJS': # [0] : must be improve? (search in the whole list?) + if t.namedEntityTag != 'ORDINAL' and t.wordList[0][0].pos != 'JJS': # [0] : must be improved? (search in the whole list?) 
assert t.parent is not None merge(t,qw) else: t.dependency = 'connectorUp' -def nsubjRule(t,qw): - if qw in strongQuestionWord: # or len(t.child) == 0: # Warning: length can decrease during analysis > needs also the R2 tag - t.dependency = 'R5s' # same as R5 except that types are propagated - elif t.parent.getWords() != ['identity']: - t.dependency = 'R3' +def nnRule(t,qw): + if t.namedEntityTag != t.parent.namedEntityTag and t.namedEntityTag != 'undef': + t.dependency = 'R5' else: + merge(t,qw) + +def mixedRule(t,qw): + if (t.parent.getWords() == ['identity'] and qw in strongQuestionWord) or not t.parent.wordList[0][0].pos.startswith('V'): + t.dependency = 'R5' + elif t.parent.getWords() == ['identity']: t.dependency = 'R2' - + else: + t.dependency = 'R3' + dependenciesMap1 = { 'undef' : 'R0', 'root' : 'R0', - 'dep' : 'R1', # ? instead of R2 + 'inst_of' : 'R6', # << + 'dep' : 'R1', 'aux' : remove, 'auxpass' : remove, 'cop' : impossible, 'arg' : impossible, - 'agent' : 'R5', + 'agent' : 'R3', 'comp' : 'R3', 'acomp' : 'R3', 'ccomp' : 'R5', - 'xcomp' : 'R3', + 'xcomp' : 'R5', 'pcomp' : 'R3', 'obj' : impossible, - 'dobj' : 'R5', #_+ instead of R5 + 'dobj' : 'R5', 'iobj' : 'R3', - 'pobj' : 'R3', # - + 'pobj' : 'R3', 'subj' : impossible, - 'nsubj' : nsubjRule, - 'nsubjpass' : 'R5', #_+ ? instead of R4 + 'nsubj' : mixedRule, # << + 'nsubjpass' : 'R5', # or R2 if necessary 'csubj' : impossible, 'csubjpass' : impossible, 'cc' : impossible, 'conj' : 'R0', 'conj_and' : ignore, 'conj_or' : ignore, - 'conj_negcc': ignore, #? + 'conj_negcc': ignore, 'expl' : remove, 'mod' : 'R4', 'amod' : amodRule, @@ -81,19 +88,18 @@ def nsubjRule(t,qw): 'mark' : remove, 'advmod' : 'R2', 'neg' : 'connectorUp', # need a NOT node - 'rcmod' : 'R4', # temp, need to be analyzed + 'rcmod' : 'R4', 'quantmod' : remove, - 'nn' : merge, + 'nn' : nnRule, 'npadvmod' : 'R5', 'tmod' : 'R3', 'num' : merge, 'number' : merge, - 'prep' : 'R5', # ? - 'prepc' : 'R5', # ? 
+ 'prep' : mixedRule, # << 'poss' : 'R5', 'possessive': impossible, 'prt' : merge, - 'parataxis' : remove, # ? + 'parataxis' : remove, 'punct' : impossible, 'ref' : impossible, 'sdep' : impossible, @@ -119,8 +125,8 @@ def propagateType(t,qw): 'R3' : ignore, # (?,!a,normalize(c)) 'R4' : ignore, # (?,normalize(c),!a) 'R5' : ignore, # (normalize(c),!a,?) - 'R5s' : propagateType, # (normalize(c),!a,?) - #'R6' : ignore, # (!a,normalize(c),?) # not use for the moment + 'R6' : propagateType, # (?,instance of,c) + 'R7' : ignore, # (!a,normalize(c),?) 'Rspl' : propagateType, # superlative 'RconjT' : propagateType, # top of a conjunction relation 'RconjB' : propagateType, # bottom of a conjunction relation @@ -155,7 +161,6 @@ def collapsePrep(t): for c in temp: collapsePrep(c) if t.dependency.startswith('prep'): # prep_x or prepc_x (others?) - # prep = t.dependency[t.dependency.index('_')+1:] # not used for the moment t.dependency = 'prep' # suffix of the prep not analyzed for the moment (just removed) def connectorUp(t): @@ -193,21 +198,21 @@ def conjConnectorsUp(t): dupl = None newTree = None if len(t.parent.child) == 1: - parentTemp = t.parent.parent # n0 - t.dependency = t.parent.dependency # dependency(n2) - t.parent.child.remove(t) # son(n1) \= n2 - dupl = deepcopy(parentTemp) # n0' - parentTemp.child.remove(t.parent) # son(n0) \= n1 - parentTemp.child.append(t) # son(n0)=n2 - t.parent = parentTemp # parent(n2) = n0 + parentTemp = t.parent.parent + t.dependency = t.parent.dependency + t.parent.child.remove(t) + dupl = deepcopy(parentTemp) + parentTemp.child.remove(t.parent) + parentTemp.child.append(t) + t.parent = parentTemp newTree = DependenciesTree(depSave, dependency=parentTemp.dependency, child=[dupl,parentTemp], parent=parentTemp.parent) parentTemp.dependency = 'RconjB' parentTemp.parent = newTree else: - parentTemp = t.parent # n0 - parentTemp.child.remove(t) # son(n1) \= n2 - dupl = deepcopy(parentTemp) # n0' - t.child += t.parent.child # son(n2) = son(n1) + 
parentTemp = t.parent + parentTemp.child.remove(t) + dupl = deepcopy(parentTemp) + t.child += t.parent.child for n in t.child: n.parent = t newTree = DependenciesTree(depSave, dependency=parentTemp.dependency, child=[dupl,t], parent=parentTemp.parent) @@ -225,7 +230,7 @@ def subStandardize(t,lmtzr): for c in t.child: subStandardize(c,lmtzr) if t.namedEntityTag == 'undef': - assert len(t.wordList) == 1 and len(t.wordList[0]) == 1 # only [0][0] ? + assert len(t.wordList) == 1 and len(t.wordList[0]) == 1 # len(t.wordList[0])=1 because the wordList of size>1 have been built by NER merging w = t.wordList[0][0] l = w.standardize(lmtzr) if l !=[]: diff --git a/ppp_questionparsing_grammatical/dependencyTreeCorrection.py b/ppp_questionparsing_grammatical/dependencyTreeCorrection.py new file mode 100644 index 0000000..0dc0075 --- /dev/null +++ b/ppp_questionparsing_grammatical/dependencyTreeCorrection.py @@ -0,0 +1,23 @@ +def addNamedEntityTag(tree, nameToNodes, words): + """ + If a word v is between 2 words u and w that have the same NER tag, + and v is linked to u or w by a nn relation, + then add the tag of u and w to v + """ + def nnDependent(n1, n2): + return (n1.parent == n2 and n1.dependency == 'nn')\ + or (n2.parent == n1 and n2.dependency == 'nn') + for i in range(1,len(words)-1): + previous = nameToNodes[words[i-1]] + current = nameToNodes[words[i]] + next = nameToNodes[words[i+1]] + if current.namedEntityTag == 'undef' and previous.namedEntityTag != 'undef' and previous.namedEntityTag == next.namedEntityTag: + if nnDependent(previous, current) or nnDependent(next, current): + current.namedEntityTag = previous.namedEntityTag + +def correctTree(tree, nameToNodes, stanfordResult=None): + """ + Correct the tree returned by the Stanford Parser, according to several heuristics. 
+ """ + words = sorted(nameToNodes.keys(), key = lambda x: int(x.split('-')[-1])) + addNamedEntityTag(tree, nameToNodes, words) diff --git a/ppp_questionparsing_grammatical/normalization.py b/ppp_questionparsing_grammatical/normalization.py index b08f10b..38b77c1 100644 --- a/ppp_questionparsing_grammatical/normalization.py +++ b/ppp_questionparsing_grammatical/normalization.py @@ -10,12 +10,12 @@ def buildValue(tree): """ Used to build the values of the normal form. len(tree.getWords()) = 1 -> single value -> return a resource - len(tree.getWords()) > 1 -> alternatives, use a list of resources + len(tree.getWords()) > 1 -> multiple alternatives -> return a list of resources """ if len(tree.getWords()) == 1: - return Resource(value=tree.getWords()[0]) + return Resource(tree.getWords()[0]) else: - return List([Resource(value=x) for x in tree.getWords()]) + return List([Resource(x) for x in tree.getWords()]) def normalizeSuperlative(tree): """ @@ -25,14 +25,14 @@ def normalizeSuperlative(tree): superlative = tree.getWords()[0] if superlative in superlativeNoun: if superlative in superlativeOrder: - return superlativeOrder[superlative](list=Sort(list=normalize(tree.child[0]),predicate=Resource(value=superlativeNoun[superlative]))) + return superlativeOrder[superlative](Sort(normalize(tree.child[0]),Resource(superlativeNoun[superlative]))) else: - return First(list=Sort(list=normalize(tree.child[0]),predicate=Resource(value=superlativeNoun[superlative]))) # First by default + return First(Sort(normalize(tree.child[0]),Resource(superlativeNoun[superlative]))) # First by default else: if superlative in superlativeOrder: - return superlativeOrder[superlative](list=Sort(list=normalize(tree.child[0]),predicate=Resource(value='default'))) # default predicate + return superlativeOrder[superlative](Sort(normalize(tree.child[0]),Resource('default'))) # default predicate else: - return First(list=Sort(list=normalize(tree.child[0]),predicate=Resource(value='default'))) + return 
First(Sort(normalize(tree.child[0]),Resource('default'))) def normalizeConjunction(tree): """ @@ -47,7 +47,7 @@ def normalizeConjunction(tree): else: result = [normalize(tree.child[1]),normalize(tree.child[0])] try: - return conjunctionTab[conjunction](list=result) + return conjunctionTab[conjunction](result) except KeyError: raise GrammaticalError(conjunction,"conjunction unknown") @@ -58,32 +58,33 @@ def normalize(tree): if tree.child == []: # leaf return buildValue(tree) if tree.child[0].dependency == 'Rexist': - return Exists(list = normalize(tree.child[0])) + return Exists(normalize(tree.child[0])) if tree.child[0].dependency == 'Rspl': # Rspl = superlative, ordinal return normalizeSuperlative(tree) if tree.child[0].dependency.startswith('Rconj'): # Rconj = conjunction return normalizeConjunction(tree) result = [] - for t in tree.child: # R0 ... R5 - assert t.dependency != 'Rspl' and not t.dependency.startswith('Rconj') + for t in tree.child: # R0 ... R7 if t.dependency == 'R0': result.append(normalize(t)) if t.dependency == 'R1': result.append(buildValue(t)) - if t.dependency == 'R2': # ou enlever la condition, ça devient R5 + if t.dependency == 'R2': if len(t.child) == 0: - result.append(Triple(subject=buildValue(t), predicate=buildValue(tree), object=Missing())) + result.append(Triple(buildValue(t),buildValue(tree),Missing())) else: result.append(normalize(t)) if t.dependency == 'R3': - result.append(Triple(subject=Missing(), predicate=buildValue(tree), object=normalize(t))) + result.append(Triple(Missing(),buildValue(tree),normalize(t))) if t.dependency == 'R4': - result.append(Triple(subject=Missing(), predicate=normalize(t), object=buildValue(tree))) - if t.dependency == 'R5' or t.dependency == 'R5s': - result.append(Triple(subject=normalize(t), predicate=buildValue(tree), object=Missing())) - #if t.dependency == 'R6': # not use for the moment - # result.append(Triple(subject=buildValue(tree), predicate=normalize(t), object=Missing())) + 
result.append(Triple(Missing(),normalize(t),buildValue(tree))) + if t.dependency == 'R5': + result.append(Triple(normalize(t),buildValue(tree),Missing())) + if t.dependency == 'R6': + result.append(Triple(Missing(),Resource('instance of'),normalize(t))) + if t.dependency == 'R7': + result.append(Triple(buildValue(tree),normalize(t),Missing())) if len(result) == 1: return result[0] else: - return Intersection(list=result) + return Intersection(result) diff --git a/ppp_questionparsing_grammatical/nounDB.py b/ppp_questionparsing_grammatical/nounDB.py index e350c12..3d71b88 100644 --- a/ppp_questionparsing_grammatical/nounDB.py +++ b/ppp_questionparsing_grammatical/nounDB.py @@ -9,7 +9,7 @@ def __init__(self): self.verbToNouns = {} def __str__(self): - return '\n'.join(["%s:\t%s" % (x,str(self.verbToNouns[x])) for x in self.verbToNouns.keys()]) + return '\n'.join(["%s:\t%s" % (x,str(self.verbToNouns[x])) for x in sorted(self.verbToNouns.keys())]) def __eq__(self, other): return self.__dict__ == other.__dict__ diff --git a/ppp_questionparsing_grammatical/preprocessing.py b/ppp_questionparsing_grammatical/preprocessing.py index f155a2b..cba9765 100644 --- a/ppp_questionparsing_grammatical/preprocessing.py +++ b/ppp_questionparsing_grammatical/preprocessing.py @@ -1,6 +1,7 @@ import sys from .preprocessingMerge import Word, mergeNamedEntityTag from .data.exceptions import QuotationError +from .dependencyTreeCorrection import correctTree from copy import deepcopy import random import string @@ -246,6 +247,7 @@ def computeTree(r): computeEdges(r,nameToNodes) computeTags(r,nameToNodes) tree = nameToNodes['ROOT-0'] # the tree is built + correctTree(tree, nameToNodes, r) initText(tree,r['text'].replace('"','\\\"')) mergeNamedEntityTag(tree) # NER merging return tree diff --git a/ppp_questionparsing_grammatical/questionWordProcessing.py b/ppp_questionparsing_grammatical/questionWordProcessing.py index 89455bf..85779c0 100644 --- 
a/ppp_questionparsing_grammatical/questionWordProcessing.py +++ b/ppp_questionparsing_grammatical/questionWordProcessing.py @@ -1,22 +1,35 @@ import sys from .preprocessingMerge import Word from .preprocessing import DependenciesTree -from .data.exceptions import QuestionWordError -from .data.questionWord import closeQuestionWord, openQuestionWord, questionAdd, questionWIs, questionType, questionExcept, existQuestionWord, semiQuestionWord, predicateQuestionWord +from .data.questionWord import closeQuestionWord, openQuestionWord, questionAdd, questionWIs, questionType, questionExcept, existQuestionWord, semiQuestionWord ##################################### # Identify and remove question word # ##################################### +def prepareInstanceOf(t): + """ + Replace by 'inst_of' the dependencies that appears on a path from the root of t to the root of the whole tree + """ + if t.dependency == 'root': + return + else: + t.dependency = 'inst_of' + if t.parent: + prepareInstanceOf(t.parent) + def removeWord(t,word): """ Remove word (of type str*int = s*position_of_s_in_sentence) from tree t - Assume that the node containing word has no child """ assert len(t.wordList) == 1 # no possible alternatives in the tree at this moment if word in t.wordList[0]: - if t.child != []: - raise QuestionWordError(word,"question word has child") + prepareInstanceOf(t) # <<< + if t.child != []: # the question is in the middle of the tree + for u in t.child: + u.dependency = t.dependency + u.parent = t.parent + t.parent.child.append(u) t.parent.child.remove(t) else: for c in t.child: @@ -37,12 +50,12 @@ def firstWords(t,start): def identifyQuestionWord(t): """ - Identify, remove (if open qw) and return the question word. + Identify, remove (if necessary) and return the question word. If there is no question word, return None. 
""" start = [None,None] firstWords(t,start) - if not start[0]: # the first word is not in the tree, we extract it directly from the sentence + if not start[0]: # the first word is not in the tree, we extract it directly from the sentence start[0] = Word(t.text.split(' ')[0],1) if not start[1]: try: @@ -51,23 +64,18 @@ def identifyQuestionWord(t): pass if start[1]: w = start[0].word.lower() + ' ' + start[1].word.lower() - if w in openQuestionWord: + if w in openQuestionWord or w in semiQuestionWord: removeWord(t,start[0]) removeWord(t,start[1]) return w if w in existQuestionWord: removeWord(t,start[1]) return w - if w in predicateQuestionWord: - return w - if w in semiQuestionWord: - removeWord(t,start[1]) - return w w = start[0].word.lower() - if w in openQuestionWord: + if w in openQuestionWord or w in semiQuestionWord: removeWord(t,start[0]) return w - if w in predicateQuestionWord or w in closeQuestionWord: + if w in closeQuestionWord: return w return None @@ -102,7 +110,8 @@ def checkSub(t,w,excMap=questionExcept): assert len(t.wordList) == 1 and len(t.wordList[0]) == 1 res = True for n in t.child: - res = res and checkSub(n,w) + if n.dependency != 'R6': # don't go through "instance of" edges + res = res and checkSub(n,w) return res else: try: @@ -117,7 +126,8 @@ def checkSubInfo(t,w,excMap=questionExcept): """ res = True for n in t.child: - res = res and checkSub(n,w,excMap) + if n.dependency != 'R6': # don't go through "instance of" edges + res = res and checkSub(n,w,excMap) return res def processQuestionInfo(t,w,excMap=questionExcept,addMap=questionAdd,wisMap=questionWIs): # TO IMPROVE @@ -151,8 +161,5 @@ def processQuestionWord(t,w): processQuestionType(t,w) # type the ROOT according to the question word if w in existQuestionWord: t.child[0].dependency = 'Rexist' - if w in semiQuestionWord or w in predicateQuestionWord: - if len(t.child[0].child) == 1: - t.child[0].child[0].dependency = 'R2' if w in openQuestionWord: processQuestionInfo(t.child[0],w) diff 
--git a/tests/data_deep.py b/tests/data_deep.py index 611028a..bdf6c5d 100644 --- a/tests/data_deep.py +++ b/tests/data_deep.py @@ -14,12 +14,69 @@ 'Who is the prime minister of France?': T(R('France'), R('prime minister'), M()), + 'Who is Homer J. Simpson?': + T(R('Homer J. Simpson'), R('identity'), M()), + 'Who is the France\'s prime minister?': T(R('France'), R('prime minister'), M()), 'What is the birth date of Bob Marley?': T(R('Bob Marley'), R('birth date'), M()), + 'Who lives in the farm?': + T(M(), R('residence'), R('farm')), + + 'How fast is a cheetah?': + T(R('cheetah'), R('speed'), M()), + + 'How wide is a tennis court?': + T(R('tennis court'), R('width'), M()), + + 'How old is Big Ben?': + T(R('Big Ben'), R('age'), M()), + + 'How tall is Burj Khalifa?': + T(R('Burj Khalifa'), R('height'), M()), + + 'How old is the son of the main actor of "I, Robot"?': + T( + T( + T(R('I, Robot'), R('main actor'), M()), + R('son'), + M() + ), + R('age'), + M() + ), + + 'How fast is the most expensive car in the World?': + T( + L( + S( + T(R('world'), R('car'), M()), + R('cost') + ), + ), + R('speed'), + M() + ), + + 'When was the daughters of the wife of the president of the United States born?': + T( + T( + T( + T(R('United States'), R('president'), M()), + R('wife'), + M() + ), + R('daughter'), + M() + ), + R('birth date'), + M() + ), + + # this question is not correct (see previous question), and so the parsing fails (no subject in the dependency tree, but a dobj). 
However, it's interesting to be able to handle such questions 'When was born the daughters of the wife of the president of the United States?': T( T( @@ -35,7 +92,26 @@ M() ), - 'Who wrote "Le Petit Prince" and "Vol de Nuit"': + 'Who are the daughters of the wife of the husband of the wife of the president of the United States?': + T( + T( + T( + T( + T(R('United States'), R('president'), M()), + R('wife'), + M() + ), + R('husband'), + M() + ), + R('wife'), + M() + ), + R('daughter'), + M() + ), + + 'Who wrote \"Le Petit Prince\" and \"Vol de Nuit\"': I([ T(R('Le Petit Prince'), R('writer'), M()), T(R('Vol de Nuit'), R('writer'), M()) @@ -69,4 +145,187 @@ 'What is the English for "Звёздные войны. Эпизод VI: Возвращение джедая"?': T(R('Звёздные войны. Эпизод VI: Возвращение джедая'), R('English'), M()), + + 'List movies directed by Spielberg.': + I([ + T(M(), R('instance of'), R('movie')), + T(M(), R('director'), R('Spielberg')) + ]), + + 'Which books were authored by Victor Hugo?': + I([ + T(M(), R('instance of'), R('book')), + T(M(), R('author'), R('Victor Hugo')) + ]), + + 'Which president has been killed by Oswald?': + I([ + T(M(), R('instance of'), R('president')), + T(M(), R('killer'), R('Oswald')) + ]), + + 'Who invented the hula hoop?': + T(R('hula hoop'), R('inventor'), M()), + + 'Who was killed by Oswald?': + T(M(), R('killer'), R('Oswald')), + + 'Which books did Suzanne Collins write?': + I([ + T(M(), R('instance of'), R('book')), + T(M(), R('author'), R('Suzanne Collins')) + ]), + + 'president of France?': + T(R('France'), R('president'), M()), + + 'Give us the queen of England': + T(R('England'), R('queen'), M()), + + 'Who is Babar?': + T(R('Babar'), R('identity'), M()), + + 'What did George Orwell write?': + T(M(), R('author'), R('George Orwell')), + + 'Who has written \"The Hitchhiker\'s Guide to the Galaxy\"?': + T(R('The Hitchhiker\'s Guide to the Galaxy'), R('author'), M()), + + 'When was the president of the United States born': + T( + T(R('United 
States'), R('president'), M()), + R('birth date'), + M() + ), + + 'From which country is Alan Turing?': + I([ + T(M(), R('instance of'), R('country')), + T(R('Alan Turing'), R('country of citizenship'), M()) + ]), + + 'In which countries is the Lake Victoria?': + I([ + T(M(), R('instance of'), R('country')), + T(R('Lake Victoria'), R('country'), M()) + ]), + + 'What actor married John F. Kennedy\'s sister?': + I([ + T(M(), R('instance of'), R('actor')), + T( + T(R('John F. Kennedy'), R('sister'), M()), + R('husband'), + M() + ) + ]), + + 'Who is J. F. Kennedy?': + T(R('J. F. Kennedy'), R('identity'), M()), + + 'Who is J. F. K.?': + T(R('J. F. K.'), R('identity'), M()), + + 'Where was Ulysses S. Grant born?': + T(R('Ulysses S. Grant'), R('birth place'), M()), + + 'Who is the US president?': + T(R('US'), R('president'), M()), + + 'Who is the United States president?': + T(R('United States'), R('president'), M()), + + 'What is a chocolate sunday?': + T(R('chocolate sunday'), R('definition'), M()), + + 'What is the D Day?': + T(R('d Day'), R('definition'), M()), + + 'What is the natural language processing?': + T(R('natural language processing'), R('definition'), M()), + + 'Where is Inoco based?': + T(R('inoco'), R('location'), M()), + + 'Who is the author of \"Le Petit Prince\"?': + T(R('Le Petit Prince'), R('author'), M()), + + 'Who are the Beatles\' members?': + T(R('beatles'), R('member'), M()), + + 'What is the biggest country in South America?': + L( + S( + T(R('South America'), R('country'), M()), + R('size') + ) + ), + + 'Who is the author of \"Animal Farm\" and \"1984\"?': + I([ + T(R('1984'), R('author'), M()), + T(R('Animal Farm'), R('author'), M()) + ]), + + 'Who was Darth Vader’s son?': + T(R('Darth Vader'), R('son'), M()), + + 'What was the monetary value of the Nobel Peace Prize in 1989?': + T( + T(R('1989'), R('Nobel Peace Prize'), M()), + R('monetary value'), + M() + ), + + 'What is the continent of Fiji and Guam?': + I([ + T(R('Fiji'), R('continent'), 
M()), + T(R('Guam'), R('continent'), M()) + ]), + + 'Who is the first president of France?': + F( + S( + T(R('France'), R('president'), M()), + R('default') + ) + ), + + 'What is the most expensive car in the world?': + L( + S( + T(R('world'), R('car'), M()), + R('cost') + ) + ), + + 'Give the capital of France': + T(R('France'), R('capital'), M()), + + 'Is there a king of England?': + E( + T(R('England'), R('king'), M()) + ), + + 'What is the highest mountain of Tanzania?': + L( + S( + T(R('Tanzania'), R('mountain'), M()), + R('height') + ) + ), + + 'What is the coldest place on earth?': + F( + S( + T(R('earth'), R('place'), M()), + R('temperature') + ) + ), + + 'Who developed Microsoft?': + T(R('Microsoft'), R('developer'), M()), + + 'Give me all companies in Munich': + T(R('Munich'), R('company'), M()), } diff --git a/tests/test_dependencytree.py b/tests/test_dependencytree.py index 92c2726..06b38cd 100644 --- a/tests/test_dependencytree.py +++ b/tests/test_dependencytree.py @@ -1,6 +1,6 @@ import json from nltk.stem.wordnet import WordNetLemmatizer -from ppp_questionparsing_grammatical import Word, QuotationHandler, DependenciesTree, computeTree, mergeNamedEntityTagChildParent, mergeNamedEntityTagSisterBrother, QuotationError, NounificationError +from ppp_questionparsing_grammatical import Word, correctTree, QuotationHandler, DependenciesTree, computeTree, mergeNamedEntityTagChildParent, mergeNamedEntityTagSisterBrother, QuotationError, NounificationError import data from unittest import TestCase @@ -14,6 +14,27 @@ def testBasicWordConstructor(self): self.assertEqual(w.pos,'bar') self.assertEqual(str(w),"(foo,1,bar)") + def testAddNamedEntityTag1(self): + foo1 = DependenciesTree('foo1', 1, namedEntityTag='42') + foo2 = DependenciesTree('foo2', 3, namedEntityTag='42') + bar = DependenciesTree('bar', 2, namedEntityTag='undef', dependency = 'nn', parent = foo1) + correctTree(foo1, {'foo1-1' : foo1, 'bar-2' : bar, 'foo2-3' : foo2}) + 
self.assertEqual(bar.namedEntityTag, '42') + + def testAddNamedEntityTag2(self): + foo1 = DependenciesTree('foo1', 1, namedEntityTag='42') + foo2 = DependenciesTree('foo2', 3, namedEntityTag='42') + bar = DependenciesTree('bar', 2, namedEntityTag='27', dependency = 'nn', parent = foo1) + correctTree(foo1, {'foo1-1' : foo1, 'bar-2' : bar, 'foo2-3' : foo2}) + self.assertEqual(bar.namedEntityTag, '27') + + def testAddNamedEntityTag3(self): + foo1 = DependenciesTree('foo1', 1, namedEntityTag='42') + foo2 = DependenciesTree('foo2', 3, namedEntityTag='42') + bar = DependenciesTree('bar', 2, namedEntityTag='undef', dependency = 'amod', parent = foo1) + correctTree(foo1, {'foo1-1' : foo1, 'bar-2' : bar, 'foo2-3' : foo2}) + self.assertEqual(bar.namedEntityTag, 'undef') + def testBasicQuotationHandler(self): handler = QuotationHandler("foo") sentence = "The person who sing \"Let It Be\" and \"Lucy in the Sky with Diamonds\" also sing \"Yellow Submarine\"." @@ -211,7 +232,7 @@ def testEntityTagMerge1(self): self.assertEqual(len(the.child),0) self.assertEqual(the.subtreeType,'undef') self.assertEqual(the.dfsTag,0) - + def testEntityTagMerge2(self): tree=computeTree(data.give_obama_president_usa()['sentences'][0]) tree.sort() diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py index 97da7b1..fca3c40 100644 --- a/tests/test_hierarchy.py +++ b/tests/test_hierarchy.py @@ -103,13 +103,22 @@ def testHierarchySimplification2(self): self.assertEqual(is_.dfsTag,0) # President president=is_.child[0] - self.assertEqual(president.wordList, [[Word("United",4,'NNP'),Word("States",5,'NNPS'),Word("president",6,'NN')]]) + self.assertEqual(president.wordList, [[Word("president",6,'NN')]]) self.assertEqual(president.namedEntityTag,'undef') self.assertEqual(president.dependency,'R2') self.assertEqual(president.parent,is_) - self.assertEqual(len(president.child),0) + self.assertEqual(len(president.child),1) self.assertEqual(president.subtreeType,'PERSON') 
self.assertEqual(president.dfsTag,0) + # US + us=president.child[0] + self.assertEqual(us.wordList, [[Word("United",4,'NNP'),Word("States",5,'NNPS')]]) + self.assertEqual(us.namedEntityTag,'LOCATION') + self.assertEqual(us.dependency,'R5') + self.assertEqual(us.parent,president) + self.assertEqual(len(us.child),0) + self.assertEqual(us.subtreeType,'undef') + self.assertEqual(us.dfsTag,0) def testHierarchyConnectors1(self): tree=computeTree(data.give_opera()['sentences'][0]) @@ -168,16 +177,25 @@ def testHierarchyConnectors1(self): self.assertEqual(gilbert.parent,first1) self.assertEqual(len(gilbert.child),0) self.assertEqual(gilbert.subtreeType,'undef') - self.assertEqual(gilbert.dfsTag,0) + self.assertEqual(gilbert.dfsTag,0) + # opera + opera=first2.child[0] + self.assertEqual(opera.wordList,[[Word("opera",8,'NN')]]) + self.assertEqual(opera.namedEntityTag,'undef') + self.assertEqual(opera.dependency,'Rspl') + self.assertEqual(opera.parent,first2) + self.assertEqual(len(opera.child),1) + self.assertEqual(opera.subtreeType,'undef') + self.assertEqual(opera.dfsTag,0) # sullivan - sullivan=first2.child[0] - self.assertEqual(sullivan.wordList,[[Word("Sullivan",7,'NNP'),Word("opera",8,'NN')]]) - self.assertEqual(sullivan.namedEntityTag,'undef') - self.assertEqual(sullivan.dependency,'Rspl') - self.assertEqual(sullivan.parent,first2) + sullivan=opera.child[0] + self.assertEqual(sullivan.wordList,[[Word("Sullivan",7,'NNP')]]) + self.assertEqual(sullivan.namedEntityTag,'PERSON') + self.assertEqual(sullivan.dependency,'R5') + self.assertEqual(sullivan.parent,opera) self.assertEqual(len(sullivan.child),0) self.assertEqual(sullivan.subtreeType,'undef') - self.assertEqual(sullivan.dfsTag,0) + self.assertEqual(sullivan.dfsTag,0) def testHierarchyConnectors2(self): tree=computeTree(data.give_chief()['sentences'][0]) @@ -264,7 +282,7 @@ def testYesNoQuestion(self): date=birth.child[0] self.assertEqual(date.wordList,[[Word("1900",4,'CD')]]) 
self.assertEqual(date.namedEntityTag,'DATE') - self.assertEqual(date.dependency,'R5') + self.assertEqual(date.dependency,'R3') self.assertEqual(date.parent,birth) self.assertEqual(len(date.child),0) self.assertEqual(date.subtreeType,'undef') diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 4b3e356..3da7e04 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -55,27 +55,37 @@ def testSuperlativeNormalize(self): "list": [ { "list": { + "predicate": { + "value": "default", + "type": "resource" + }, "list": { "value": "Gilbert", "type": "resource" }, - "predicate": { - "value" : "default", - "type" : "resource" - }, "type": "sort" }, "type": "first" }, { "list": { - "list": { - "value": "Sullivan opera", + "predicate": { + "value": "default", "type": "resource" }, - "predicate": { - "value" : "default", - "type" : "resource" + "list": { + "predicate": { + "value": "opera", + "type": "resource" + }, + "object": { + "type": "missing" + }, + "subject": { + "value": "Sullivan", + "type": "resource" + }, + "type": "triple" }, "type": "sort" }, @@ -83,8 +93,7 @@ def testSuperlativeNormalize(self): } ], "type": "intersection" -} -) +}) def testNormalize1(self): tree = computeTree(data.give_president_of_USA()['sentences'][0]) @@ -174,31 +183,41 @@ def testNormalize3(self): "type": "intersection", "list": [ { - "object": { - "type": "missing" - }, "type": "triple", "predicate": { "value": "identity", "type": "resource" }, + "object": { + "type": "missing" + }, "subject": { "value": "Obama", "type": "resource" } }, { - "object": { - "value": "United States president", - "type": "resource" - }, "type": "triple", "predicate": { "value": "identity", "type": "resource" }, - "subject": { + "object": { "type": "missing" + }, + "subject": { + "type": "triple", + "predicate": { + "value": "president", + "type": "resource" + }, + "subject": { + "value": "United States", + "type": "resource" + }, + "object": { + "type": "missing" + } } 
} ] @@ -230,6 +249,10 @@ def testNormalizeR8(self): { "type": "resource", "value": "residence" + }, + { + "type": "resource", + "value": "country" } ] }, @@ -382,17 +405,17 @@ def testSemiQuestionWord2(self): qw = simplify(tree) result = normalize(tree) self.assertEqual(result,{ - "subject": { - "type": "resource", - "value": "US president" - }, - "type": "triple", "object": { "type": "missing" }, "predicate": { - "type": "resource", - "value": "list" + "value": "president", + "type": "resource" + }, + "type": "triple", + "subject": { + "value": "US", + "type": "resource" } })