From 06c4508a9c6fd35c47cce34f87abe524e115fb72 Mon Sep 17 00:00:00 2001
From: jonhusson
Date: Fri, 16 Dec 2016 16:33:17 -0800
Subject: [PATCH] initial commit

---
 .gitignore                      |   4 +
 README.md                       |  91 +++++++++
 config                          |  17 ++
 credentials.example             |   6 +
 extractions/SQL.txt             |  31 +++
 input/url.txt                   |   1 +
 makefile                        |   8 +
 output/.gitignore               |   2 +
 requirements.txt                |   6 +
 run.py                          |  78 ++++++++
 setup/setup.sh                  |  53 +++++
 udf/buildbib.py                 | 104 ++++++++++
 udf/ext_age_check.py            | 254 ++++++++++++++++++++++++
 udf/ext_references.py           | 219 +++++++++++++++++++++
 udf/ext_results.py              | 170 ++++++++++++++++
 udf/ext_strat_mentions.py       | 264 +++++++++++++++++++++++++
 udf/ext_strat_phrases.py        | 336 ++++++++++++++++++++++++++++++++
 udf/ext_strat_target.py         | 269 +++++++++++++++++++++++++
 udf/ext_strat_target_distant.py | 326 +++++++++++++++++++++++++++++++
 udf/ext_target.py               | 166 ++++++++++++++++
 udf/ext_target_adjective.py     | 100 ++++++++++
 udf/initdb.py                   | 194 ++++++++++++++++++
 var/strat_variables.txt         |  19 ++
 var/target_variables.txt        |  11 ++
 24 files changed, 2729 insertions(+)
 create mode 100755 .gitignore
 create mode 100755 README.md
 create mode 100755 config
 create mode 100755 credentials.example
 create mode 100755 extractions/SQL.txt
 create mode 100755 input/url.txt
 create mode 100755 makefile
 create mode 100755 output/.gitignore
 create mode 100755 requirements.txt
 create mode 100755 run.py
 create mode 100755 setup/setup.sh
 create mode 100755 udf/buildbib.py
 create mode 100755 udf/ext_age_check.py
 create mode 100755 udf/ext_references.py
 create mode 100755 udf/ext_results.py
 create mode 100755 udf/ext_strat_mentions.py
 create mode 100755 udf/ext_strat_phrases.py
 create mode 100755 udf/ext_strat_target.py
 create mode 100755 udf/ext_strat_target_distant.py
 create mode 100755 udf/ext_target.py
 create mode 100755 udf/ext_target_adjective.py
 create mode 100755 udf/initdb.py
 create mode 100755 var/strat_variables.txt
 create mode 100755 var/target_variables.txt

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..2cbe8df
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.DS_Store
+
+credentials
+*.swp
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..96a8188
--- /dev/null
+++ b/README.md
@@ -0,0 +1,91 @@
+# GeoDeepDive Application Template
+A template for building applications for [GeoDeepDive](https://geodeepdive.org)
+
+## Getting started
+Dependencies:
+  + [GNU Make](https://www.gnu.org/software/make/)
+  + [git](https://git-scm.com/)
+  + [pip](https://pypi.python.org/pypi/pip)
+  + [PostgreSQL](http://www.postgresql.org/)
+
+### OS X
+OS X ships with GNU Make, `git`, and Python, but you will need to install `pip` and PostgreSQL.
+
+To install `pip`:
+````
+sudo easy_install pip
+````
+
+To install PostgreSQL, it is recommended that you use [Postgres.app](http://postgresapp.com/). Download
+the most recent version, and be sure to follow [the instructions](http://postgresapp.com/documentation/cli-tools.html)
+for setting up the command line tools, primarily adding the following line to your `~/.bash_profile`:
+
+````
+export PATH=$PATH:/Applications/Postgres.app/Contents/Versions/latest/bin
+````
+
+
+### Setting up the project
+First, clone this repository and run the setup script:
+
+````
+git clone https://github.com/UW-DeepDiveInfrastructure/app-template
+cd app-template
+make
+````
+
+Edit `credentials` with the connection credentials for your local Postgres database.
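+
+The `credentials` file is copied from `credentials.example` by `make`, and follows the same YAML layout, for example:
+
+````
+postgres:
+  user: postgres_username
+  port: 5432
+  host: localhost
+  database: deepdive_app
+  password: password123
+````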
+
+To create a database with the data included in `/setup/usgs_example`:
+
+````
+make local_setup
+````
+
+To run an example, run `python run.py`.
+
+## Running on GeoDeepDive Infrastructure
+All applications are required to have the same structure as this repository, namely an empty folder named `output`, a valid
+`config` file, an updated `requirements.txt` describing any Python dependencies, and a `run.py` which runs the application
+and outputs results. The `credentials` file will be ignored and substituted with a unique version at run time.
+
+The GeoDeepDive infrastructure will have the following software available:
+  + Python 2.7+ (Python 3.x not supported at this time)
+  + PostgreSQL 9.4+, including command line tools and PostGIS
+
+#### Submitting a config file
+The `config` file outlines a list of terms OR dictionaries that you are interested in culling from the corpus. Once you have
+updated this file, a private repository will be set up for you under the UW-DeepDiveInfrastructure Github group, to which you can
+push the code from this repository. Your `config` file will be used to generate a custom testing subset of documents that
+you can use to develop your application.
+
+#### Running the application
+Once you have developed your application and tested it against the corpus subset, simply push your application to the
+private repository created in the previous step. The application will then be run according to the parameters set in the
+`config` file.
+
+#### Getting results
+After the application is run, the contents of the `output` folder will be gzipped and made available for download. If
+your application did not run successfully, any errors thrown will be logged in the file `errors.txt`, which is included
+in the gzipped results package.
+
+## File Summary
+
+#### config
+A YAML file that contains project settings.
+
+
+#### credentials
+A YAML file that contains local Postgres credentials for testing and generating examples.
+
+
+#### requirements.txt
+A list of Python dependencies to be installed by `pip`.
+
+
+#### run.py
+A Python script that runs the entire application, including any setup tasks and the export of results to the folder `/output`.
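+
+
+#### output
+The folder where `run.py` writes its results. After a successful local run, the dumped tables in `output/output.sql` can be
+loaded into another Postgres database for inspection (a usage sketch; `DBNAME` is a placeholder — see `extractions/SQL.txt`):
+
+````
+psql -d DBNAME -f output/output.sql
+````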
+ + +## License +CC-BY 4.0 International diff --git a/config b/config new file mode 100755 index 0000000..fe1a90a --- /dev/null +++ b/config @@ -0,0 +1,17 @@ +# The name of the application (no spaces) +app_name: strom + +# First and last name of the user +user: Jon Husson + +# The NLP product to run the application against +product: NLP352 + +# How often the application should be run +frequency: monthly + +# A list of terms used to subset the corpus +terms: [stromatolite, stromatolitic, thrombolite, thrombolitic] + +# Stored dictionary of terms, to be set by GDD infrastructure admins +dictionary: strom diff --git a/credentials.example b/credentials.example new file mode 100755 index 0000000..82178b7 --- /dev/null +++ b/credentials.example @@ -0,0 +1,6 @@ +postgres: + user: postgres_username + port: 5432 + host: localhost + database: deepdive_app + password: password123 diff --git a/extractions/SQL.txt b/extractions/SQL.txt new file mode 100755 index 0000000..5b17172 --- /dev/null +++ b/extractions/SQL.txt @@ -0,0 +1,31 @@ +#============================================================================== +# PG DUMP FOR RESULTS +#============================================================================== + +pg_dump -t results -t strat_target -t strat_target_distant -t age_check -t bib -t target_adjectives DBNAME > ./output/output.sql + +#============================================================================== +# CREATE (ALREADY PRESENT) DATABASE FROM DUMP +#============================================================================== + +psql -d DBNAME -f ../output/output.sql + +#============================================================================== +# USEFUL SQL QUERIES FOR SUMMARY RESULTS +#============================================================================== + +COPY(SELECT strat_phrase_root,strat_name_id, COUNT(strat_name_id) + FROM results + WHERE (strat_name_id<>'0' AND target_word ILIKE '%stromato%') + GROUP BY strat_phrase_root, strat_name_id) + TO '/Users/jhusson/Box Sync/postdoc/deepdive/stroms/V2/test.csv' DELIMITER ',' CSV HEADER; + +#============================================================================== +# INTERESTING STROMATOLITE ADJECTIVES +#============================================================================== + +SELECT * from target_adjectives WHERE target_adjective ILIKE 'domal' OR +target_adjective ILIKE 'columnar' OR +target_adjective ILIKE 'conical' OR +target_adjective ILIKE 'domical' OR +target_adjective ILIKE 'domed' \ No newline at end of file diff --git a/input/url.txt b/input/url.txt new file mode 100755 index 0000000..e9231e7 --- /dev/null +++ b/input/url.txt @@ -0,0 +1 @@ +deepdivesubmit.chtc.wisc.edu/static/strom_nlp_27Jan2016.zip diff --git a/makefile b/makefile new file mode 100755 index 0000000..ff24788 --- /dev/null +++ b/makefile @@ -0,0 +1,8 @@ +all: + cp credentials.example credentials; + pip install -r requirements.txt; + + + +local_setup: + ./setup/setup.sh diff --git a/output/.gitignore b/output/.gitignore new file mode 100755 index 0000000..d6b7ef3 --- /dev/null +++ b/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..1070da6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +psycopg2>=2.6.1 +pyyaml>=3.11 +tqdm>=1.0 +stop-words>=2015.2.23.1 +docopt>=0.6.1 +numpy>=1.9.2 \ No newline at end of file diff --git a/run.py b/run.py new file mode 100755 index 0000000..ed3c80d --- /dev/null +++ b/run.py @@ -0,0 +1,78 @@ 
+#==============================================================================
+#RUN ALL - STROMATOLITES
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites
+
+#==============================================================================
+
+import os, time, subprocess, yaml
+
+#tic
+start_time = time.time()
+
+#load configuration file
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+#load credentials file
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+
+#ensure working directory is proper
+#os.chdir("/Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites")
+
+#INITIALIZE THE POSTGRES TABLES
+print 'Step 1: Initialize the PSQL tables ...'
+subprocess.call('./setup/setup.sh', shell=True)
+os.system('python ./udf/initdb.py')
+
+#BUILD THE BIBLIOGRAPHY
+print 'Step 2: Build the bibliography ...'
+os.system('python ./udf/buildbib.py')
+
+#FIND TARGET INSTANCES
+print 'Step 3: Find stromatolite instances ...'
+os.system('python ./udf/ext_target.py')
+
+#FIND STRATIGRAPHIC ENTITIES
+print 'Step 4: Find stratigraphic entities ...'
+os.system('python ./udf/ext_strat_phrases.py')
+
+#FIND STRATIGRAPHIC MENTIONS
+print 'Step 5: Find stratigraphic mentions ...'
+os.system('python ./udf/ext_strat_mentions.py')
+
+#CHECK AGE - UNIT MATCH AGREEMENT
+print 'Step 6: Check age - unit match agreement ...'
+os.system('python ./udf/ext_age_check.py')
+
+#DEFINE RELATIONSHIPS BETWEEN TARGET AND STRATIGRAPHIC NAMES
+print 'Step 7: Define the relationships between stromatolite phrases and stratigraphic entities/mentions ...'
+os.system('python ./udf/ext_strat_target.py')
+
+#DEFINE RELATIONSHIPS BETWEEN TARGET AND DISTANT STRATIGRAPHIC NAMES
+print 'Step 8: Define the relationships between stromatolite phrases and distant stratigraphic entities/mentions ...'
+os.system('python ./udf/ext_strat_target_distant.py')
+
+#DELINEATE REFERENCE SECTIONS FROM MAIN BODY EXTRACTIONS
+print 'Step 9: Delineate reference section from main body extractions ...'
+os.system('python ./udf/ext_references.py')
+
+#BUILD A BEST RESULTS TABLE OF STROM-STRAT_NAME TUPLES
+print 'Step 10: Build a best results table of strom-strat_name tuples ...'
+os.system('python ./udf/ext_results.py')
+
+#FIND ADJECTIVES DESCRIBING STROM
+print 'Step 11: Find adjectives describing strom target words ...'
+os.system('python ./udf/ext_target_adjective.py')
+
+#POSTGRES DUMP
+print 'Step 12: Dump select results from PSQL ...'
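+#the dump command below is assembled from the local credentials; with the
+#values from credentials.example it would expand to, for example:
+#   pg_dump -U postgres_username -t results -t strat_target -t strat_target_distant -t age_check -t refs_location -t bib -t target_adjectives -d deepdive_app > ./output/output.sql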
+output = 'pg_dump -U '+ credentials['postgres']['user'] + ' -t results -t strat_target -t strat_target_distant -t age_check -t refs_location -t bib -t target_adjectives -d ' + credentials['postgres']['database'] + ' > ./output/output.sql' +subprocess.call(output, shell=True) + +#summary of performance time +elapsed_time = time.time() - start_time +print '\n ###########\n\n elapsed time: %d seconds\n\n ###########\n\n' %(elapsed_time) diff --git a/setup/setup.sh b/setup/setup.sh new file mode 100755 index 0000000..f388a94 --- /dev/null +++ b/setup/setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# via http://stackoverflow.com/a/21189044/1956065 +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i length of VARIABLE == number of fields + #--> length of VARIABLE[i] == number of rows + #--> VARIABLE[i][0] = header name + cols = list(zip(*dump)) + + #key names correspond to field names (headers in the CSV file) + for field in cols: + dump_dict[field[0]]=field[1:] + + dump_dict['headers'] = sorted(dump_dict.keys()) + + return dump_dict + + +#Connect to Postgres +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + + +#initalize the age_check table +cursor.execute(""" + DELETE FROM age_check; +""") +connection.commit() + +#initialize the age_agree column in strat_phrases +cursor.execute(""" + UPDATE strat_phrases + SET age_agree = '-'; +""") +connection.commit() + + +#strat_phrases data dump +cursor.execute(""" + SELECT DISTINCT ON(strat_name_id, int_name) + + strat_phrase_root, + strat_flag, + strat_name_id, + int_name, + int_id + + FROM strat_phrases + + WHERE strat_name_id<>'0' + AND int_name<>'na' + +""") + + +#convert list of tuples to list of lists +int_list=cursor.fetchall() +int_list = [list(elem) for elem in int_list] + +#gather list of unique strat_name_ids +strat_name_ids = set([i[2] for i in int_list]) + +#define overlap buffer between unit_ages and interval_ages +age_buff=25 + +#initialize summary variables +huh=0 +yay=0 +nay=0 +int_check=[] + +#loop through all unique strat_name ids to check extracted age - unit link congruency +for idx, name in enumerate(strat_name_ids): + + #find all strat_name_list - interval tuples to be checked + name_check = [j for j in int_list if j[2]==name] + + #split into individual strat_name_ids by user-defined deilimiter + strat_name_id = name.split('~') + + #initiliaze variables for checking + unit_ages=[] + skip=0 + + #loop through each individual strat_name_id + for match in strat_name_id: + #hit the api to find unit_matches through /units route + unit_link = download_csv( 'https://macrostrat.org/api/units?format=csv&strat_name_id=' + match ) + + #if matches found, define b_age and t_age for a given strat_name from the constituent units + if unit_link['headers']: + 
+            unit_ages.append([max(float(x) for x in unit_link['b_age']), min(float(x) for x in unit_link['t_age'])])
+        else:
+            unit_ages.append('na')
+            skip+=1
+
+    #loop through each individual strat_name_list - interval tuple
+    for idx2,row in enumerate(name_check):
+
+        #initialize variables for checking
+        age_check=[]
+        int_id=row[4]
+        age_def=row[3]
+        int_age=[]
+
+        #case if interval information is a known interval name (AND at least one strat_name_id has a match)
+        if int_id!=0 and len(strat_name_id)!=skip:
+            int_def = download_csv( 'https://macrostrat.org/api/defs/intervals?format=csv&int_id=' + str(int_id) )
+            int_age = [float(int_def['b_age'][0]), float(int_def['t_age'][0])]
+
+        #case if interval information is a numeric age (AND at least one strat_name_id has a match)
+        elif len(strat_name_id)!=skip:
+            age_def=age_def.split(' ')
+
+            #if units are Gyr
+            if age_def[1].lower() in 'ga':
+                try:
+                    age=float(age_def[0])*1000
+                    int_age = [abs(age), abs(age)]
+
+                except ValueError:
+                    age='na'
+
+            #if units are Myr
+            else:
+                try:
+                    age=float(age_def[0])
+                    int_age = [abs(age), abs(age)]
+
+                except ValueError:
+                    age='na'
+
+        #compare each individual strat_name age range to the interval information
+        for unit in unit_ages:
+
+            #case if unit or interval information not recovered
+            if unit=='na' or not int_age:
+                age_check.append('NA')
+                huh+=1
+
+            #case if unit and interval ages do not cross (within the buffer)
+            elif unit[0]+age_buff<int_age[1] or unit[1]>int_age[0]+age_buff:
+                age_check.append('no')
+                nay+=1
+
+            #case if they do
+            else:
+                age_check.append('yes')
+                yay+=1
+
+        #summarize the findings for all strat_name_ids
+        name_check[idx2].extend(['~'.join(age_check)])
+
+    #dump to a local variable
+    int_check.extend(name_check)
+
+#write to PSQL table
+for idx,i in enumerate(int_check):
+    strat_phrase_root, strat_flag, strat_name_id, int_name, int_id, age_agree = i
+
+    cursor.execute("""
+        INSERT INTO age_check(  strat_phrase_root,
+                                strat_flag,
+                                strat_name_id,
+                                int_name,
+                                int_id,
+                                age_agree)
+        VALUES (%s, %s, %s, %s, %s, %s);""",
+        (strat_phrase_root, strat_flag, strat_name_id, int_name, int_id, age_agree)
+        )
+
+
+#push insertions
+connection.commit()
+
+#VACUUM ANALYZE cannot run inside a transaction block, so switch to autocommit
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE age_check;
+""")
+connection.commit()
+
+
+#splice strat_name_id-age tuples into the strat_phrases table
+cursor.execute(""" UPDATE strat_phrases
+
+        SET age_agree = age_check.age_agree
+        FROM age_check
+        WHERE strat_phrases.strat_name_id = age_check.strat_name_id
+        AND strat_phrases.int_name = age_check.int_name
+
+""")
+connection.commit()
+
+#VACUUM ANALYZE cannot run inside a transaction block, so switch to autocommit
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE strat_phrases;
+""")
+connection.commit()
+
+#close the connection
+connection.close()
+
+
+#summary statistic
+success = 'SUMMARY OF AGE CHECKS: yays = %s; nays = %s; unknown = %s' %(yay, nay, huh)
+
+#summary of performance time
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
diff --git a/udf/ext_references.py b/udf/ext_references.py
new file mode 100755
index 0000000..62e739a
--- /dev/null
+++ b/udf/ext_references.py
@@ -0,0 +1,219 @@
+#==============================================================================
+#DEFINE BEGINNING OF REFERENCES SECTION
+#==============================================================================
+
+#==============================================================================
+# ACQUIRE RELEVANT MODULES and DATA
+#============================================================================== + +import time, psycopg2, yaml +import numpy as np + +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +#Credentials and configuration +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) + +#make some cursors for writing/reading from Postgres +cursor = connection.cursor() +doc_cursor=connection.cursor() +sent_cursor = connection.cursor() + + +#============================================================================== +# FIND REFERENCE SECTIONS +#============================================================================== + +#list of unique docids from target-strat tuples +doc_cursor.execute(""" + SELECT docid FROM strat_target + UNION + SELECT docid FROM strat_target_distant +""") + +#initialize Numpy arrays +refs=np.zeros(0,dtype={'names':['docid','sentid','type','depth'],'formats':['|S100','i4','|S100','f4']}) +best_refs=np.zeros(0,dtype={'names':['docid','sentid','type','depth'],'formats':['|S100','i4','|S100','f4']}) + +#loop through documents list +for idx, doc in enumerate(doc_cursor): + #array for reference section for this document + tmp_refs=np.zeros(0,dtype={'names':['docid','sentid','type','depth'],'formats':['|S100','i4','|S100','f4']}) + + #collect all sentences for this document + sent_cursor.execute(""" + SELECT docid, sentid, words from %(my_app)s_sentences_%(my_product)s + WHERE docid=%(my_docid)s;""", + { + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()), + "my_docid": doc[0], + }) + + #loop through sentences + for idx2, sent in enumerate(sent_cursor): + docid,sentid,words = sent + phrase = ' '.join(words) + + #REF ID LOGIC: is the first word in a sentence 'References'? + if words[0]=='References' or words[0]=='REFERENCES': + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is the first word in a sentence 'Bibliography'? + if words[0]=='Bibliography' or words[0]=='BIBLIOGRAPHY': + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is the first word in a sentence French for 'Bibliography'? + if words[0]=='Bibliographie' or words[0]=='BIBLIOGRAPHIE': + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is there an all capitalized 'REFERENCES' in words array? + if 'REFERENCES' in words: + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref_mention',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is the word 'Acknowledgements' in words array? + if 'Acknowledgements' in words or 'Acknowledgments' in words or 'ACKNOWLEDGEMENTS' in words or 'ACKNOWLEDGMENTS' in words: + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ack',0)],dtype=tmp_refs.dtype)) + + #null case where no reference section is identified + if len(tmp_refs)==0: + tmp_refs = np.array([(docid,0,'none',0)],dtype=tmp_refs.dtype) + + #parameter characterizing how deep the reference section is (ref sent #)/(total sent #) + tmp_refs['depth']=tmp_refs['sentid']/(idx2+1.) 
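+
+    #worked example (hypothetical numbers): a 'References' header at sentid 450
+    #of a 500-sentence document gets depth = 450/500 = 0.9 and survives the 0.1
+    #cutoff applied below; a stray 'REFERENCES' at sentid 30 gets 0.06 and is
+    #reset to the null case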
+ + #all potential reference breaks + refs = np.append(refs,tmp_refs) + + #'Best' reference break is the deepest sentid + tmp_refs=np.sort(tmp_refs,order='sentid') + best_refs = np.append(best_refs,tmp_refs[-1]) + + +#arbitrary cutoff for 'good' inferences - reset those below threshold to null case +best_refs['sentid'][best_refs['depth']<0.1]=0 +best_refs['type'][best_refs['depth']<0.1]='none' +best_refs['depth'][best_refs['depth']<0.1]=0.0 + +zeros = best_refs[best_refs['sentid']==0] + + +#============================================================================== +# PUSH REFERENCE FINDINGS TO POSTGRES +#============================================================================== + +#Make a new table +cursor.execute(""" + DROP TABLE IF EXISTS refs_location CASCADE; + CREATE TABLE refs_location( + docid text, + sentid int, + type text, + depth real); +""") +connection.commit() + +#loop through best reference ids and push to Postgres +for row in best_refs: + cursor.execute(""" + INSERT INTO refs_location( docid, + sentid, + type, + depth) + VALUES (%s, %s, %s, %s);""", + (row['docid'],str(row['sentid']),row['type'],str(row['depth'])) + ) + + +#Join reference locations to target-strat tuples +cursor.execute(""" UPDATE strat_target + SET refs_loc = refs_location.sentid + FROM refs_location + WHERE strat_target.docid = refs_location.docid + +""") + +#Join reference locations to target-strat_distant tuples +cursor.execute(""" UPDATE strat_target_distant + SET refs_loc = refs_location.sentid + FROM refs_location + WHERE strat_target_distant.docid = refs_location.docid +""") + +#Add 'in references'/'out of references' inference to target-strat tuples +cursor.execute(""" UPDATE strat_target + SET in_ref = 'yes' + WHERE sentid > refs_loc + AND refs_loc <>0 + +""") + +#Add 'in references'/'out of references' inference to target-strat_distant tuples +cursor.execute(""" UPDATE strat_target_distant + SET in_ref = 'yes' + WHERE sentid > refs_loc + AND refs_loc <>0 + +""") + +#push changes +connection.commit() + +#close the postgres connection +connection.close() + +elapsed_time = time.time() - start_time + + +#%% FOR DEBUGGING + +#tmp_refs=best_refs[(best_refs['sentid']!=0)] +# +#tmp = tmp_refs[np.random.choice(len(tmp_refs), 1)] +# +#my_sentid= np.arange(tmp['sentid']-4,tmp['sentid']+20) +# +#sent_cursor.execute(""" +# SELECT docid, sentid, words from %(my_app)s_sentences_%(my_product)s +# WHERE docid=%(my_docid)s +# AND sentid = ANY(%(my_sentid)s) +# ORDER BY sentid;""", +# { +# "my_app": AsIs(config['app_name']), +# "my_product": AsIs(config['product'].lower()), +# "my_docid": tmp['docid'][0], +# "my_sentid": (list(my_sentid),) +# }) +# +#phrase='' +#for idx2, sent in enumerate(sent_cursor): +# docid,sentid,words = sent +# words = ' '.join(words) +# +# if sentid==tmp['sentid']: +# flag=words +# phrase = phrase+'\n***** '+words +# else: +# phrase = phrase+'\n-'+words +## print words +# +## if sentid==tmp['sentid']: +# +# +#print '\n ###########\n\n %s \n\n ###########\n\n %s \n\n ###########\n\n' %(phrase,flag) + diff --git a/udf/ext_results.py b/udf/ext_results.py new file mode 100755 index 0000000..87e7ef4 --- /dev/null +++ b/udf/ext_results.py @@ -0,0 +1,170 @@ +#============================================================================== +#GENERATE RESULTS TABLE +#============================================================================== + +import time, random, re, yaml, psycopg2, copy +from psycopg2.extensions import AsIs + +start_time = time.time() + +# Connect to Postgres +with 
open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + + +#NEW RESULTS TABLE +cursor.execute(""" + DROP TABLE IF EXISTS results CASCADE; + CREATE TABLE results( + target_id int, + docid text, + sentid int, + target_word text, + strat_phrase_root text, + strat_flag text, + strat_name_id text, + age_sum text, + source text, + phrase text, + is_strat_name text DEFAULT 'yes', + in_ref text + ); +""") +connection.commit() + +#TMP RESULTS TABLE +cursor.execute(""" + DROP TABLE IF EXISTS results_new; +""") + +#push drop/create to the database +connection.commit() + +#gather results from the same-sentence inferences +cursor.execute(""" + INSERT INTO results (target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, phrase, in_ref) + (SELECT target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, sentence, in_ref + FROM strat_target + WHERE ((num_phrase=1 AND @(target_distance)<51) + OR (target_relation='parent' AND num_phrase <8 AND @(target_distance)<51) + OR (target_relation='child' AND num_phrase <8 AND @(target_distance)<51)))""" +) + +#push insertions +connection.commit() + +#mark these inferences as coming from same sentence +cursor.execute(""" + UPDATE results SET source='in_sent' WHERE source IS NULL + """ +) + +#push update +connection.commit() + +#gather results from the near-sentence inferences +cursor.execute(""" + INSERT INTO results (target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, phrase, in_ref) + (SELECT target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, words_between, in_ref + FROM strat_target_distant + WHERE num_phrase=1)""" +) + +#push insertions +connection.commit() + +#mark these inferences as coming from near sentence +cursor.execute(""" + UPDATE results SET source='out_sent' WHERE source IS NULL + """ +) + +#remove non-unique rows +cursor.execute(""" + CREATE TABLE results_new AS (SELECT DISTINCT * FROM results) + """ +) + + +#adopt tmp results table +cursor.execute(""" + DROP TABLE results + """ +) + +cursor.execute(""" + ALTER TABLE results_new RENAME TO results; + """ +) + + +#add serial primary key +cursor.execute(""" + ALTER TABLE results ADD COLUMN result_id serial PRIMARY KEY; + """ +) + +#push updates +connection.commit() + +#list of known and troublesome ligatures +weird_strings = [['\xef\xac\x82', 'fl'], ['\xef\xac\x81', 'fi']] + + +#IMPORT THE RESULTS - SIMPLE CHECK FOR STRAT NAME MENTION VALIDITY +cursor_main = connection.cursor() +cursor_main.execute(""" SELECT * FROM results WHERE strat_flag = 'mention'; """) + +test=[] + +for line in cursor_main: + #collect individual elements from the results dump + target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, source, phrase, mention_check, in_ref, result_id = line + checked=[] + + #ligature replacement + for ws in weird_strings: + if ws[0] in phrase: + phrase=phrase.replace(ws[0],ws[1]) + + #find all mentions of strat_phrase_root + matches=[m.start() for m in re.finditer(strat_phrase_root,phrase)] + + #loop through 
matches + for m in matches: + #lets look at the word that follows the potential strat name + tocheck = phrase[m+len(strat_phrase_root)+1:] + tocheck=tocheck.split(' ') + + #capitalized word following strat name mention invalidates it. Exceptions include: + #1) end of sentence 2) Series 3) parantheses + if tocheck[0].lower()!=tocheck[0] and tocheck[0]!='Series' and tocheck[0][0]!='.' and tocheck[0]!='-LRB-' and tocheck[0]!='-RRB-': + checked.append('no') + else: + checked.append('yes') + + #update post gres table + if 'yes' not in checked: + cursor.execute(""" + UPDATE results SET is_strat_name = %s WHERE result_id = %s;""", + ('no',result_id) + ) + +#push update +connection.commit() + +#close the postgres connection +connection.close() + diff --git a/udf/ext_strat_mentions.py b/udf/ext_strat_mentions.py new file mode 100755 index 0000000..1eae13e --- /dev/null +++ b/udf/ext_strat_mentions.py @@ -0,0 +1,264 @@ +##============================================================================== +## LOOK FOR STRATIGRAPHIC NOMENCLATURE - MENTION RECOGINITION +##============================================================================== + +#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf + +#============================================================================== +# ACQUIRE RELEVANT MODULES +#============================================================================== +import time, urllib2, csv, random, psycopg2, re, yaml +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +#function for dowloading CSVs from a URL +def download_csv( url ): + + #return variable + dump_dict = {} + + #get strat_names from Macrostrat API + dump = urllib2.urlopen( url ) + dump = csv.reader(dump) + + #unpack downloaded CSV as list of tuples + #--> length of VARIABLE == number of fields + #--> length of VARIABLE[i] == number of rows + #--> VARIABLE[i][0] = header name + cols = list(zip(*dump)) + + #key names correspond to field names (headers in the CSV file) + for field in cols: + dump_dict[field[0]]=field[1:] + + dump_dict['headers'] = sorted(dump_dict.keys()) + + return dump_dict + +#============================================================================== +# CONNECT TO POSTGRES +#============================================================================== + +# Connect to Postgres +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + +#initialize mentions +cursor.execute("""DELETE FROM strat_phrases WHERE strat_flag='mention'; +""") + +#import sentences to mine - just restricted to sentences with target instance +cursor.execute(""" + SELECT DISTINCT ON (target_instances.docid, + target_instances.sentid) + + target_instances.docid, + target_instances.sentid, + %(my_app)s_sentences_%(my_product)s.words + FROM %(my_app)s_sentences_%(my_product)s, target_instances + WHERE %(my_app)s_sentences_%(my_product)s.docid = target_instances.docid + AND %(my_app)s_sentences_%(my_product)s.sentid = target_instances.sentid; +""",{ + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()) +}) +sentences=cursor.fetchall() + +#convert list of 
tuples to list of lists +sentences = [list(elem) for elem in sentences] + +#import docid - strat_name tuples +cursor.execute(""" + SELECT * FROM strat_dict; +""") +connection.commit() + +strat_dict = cursor.fetchall() + +#convert list of tuples to list of lists +strat_dict = [list(elem) for elem in strat_dict] + +#make a dictionary of docid-strat_name tuples +doc_list={} +for i in strat_dict: + doc_list[i[0]]=set(i[1]) + +#============================================================================== +# DEFINE STRATIGRPAHIC VARIABLES +#============================================================================== + +#get interval_names from Macrostrat API +int_dict = download_csv( 'https://macrostrat.org/api/defs/intervals?all&format=csv' ) + +#user-defined variables +with open('./var/strat_variables.txt') as fid: + strat_variables = fid.readlines() + +for i in strat_variables: + exec i + +#PRE-PROCESS: hack to replace weird strings +for idx,line in enumerate(sentences): + for ws in weird_strings: + if ws[0] in ' '.join(sentences[idx][2]): + sentences[idx][2]=[word.replace(ws[0],ws[1]) for word in sentences[idx][2]] + + +#with a dictionary of stratigraphic entites mapped to a given document, find the mentions +# i.e. find 'the Bitter Springs stromatolite' after identifying 'the Bitter Springs Formation' +strat_flag = 'mention' +age_agree='-' + +strat_list=[] + +#loop through documents with discoverd stratigraphic entities +for idx1,doc in enumerate(doc_list.keys()): + #list of sentences data from a given document + target_sents = [k for k in sentences if k[0]==doc] + #list of stratigraphic names associated with that document + target_strat = list(doc_list[doc]) + + + #loop through sentence data per document + for idx2,line in enumerate(target_sents): + doc_id, sent_id, words = line + + sentence = ' '.join(words) + + for name in target_strat: + #parse the (strat_name, strat_name_id) tuple + strat_phrase=name.split(DICT_DELIM)[0] + strat_phrase=strat_phrase.split(' ') + strat_phrase=' '.join(strat_phrase[0:-1]) + + strat_name_id=name.split(DICT_DELIM)[1] + + matches=[m.start() for m in re.finditer(r'\b' + strat_phrase + r'\b',sentence)] + + if matches: + #if at least one match is found, count number of spaces backward to arrive at word index + name_idx = [sentence[0:m].count(' ') for m in matches] + #remove double hits (i.e. stromatolitic-thrombolitic) + name_idx = list(set(name_idx)) + #split the strat mention into parts + name_part = strat_phrase.split(' ') + + #loop through all discoveries + for i in name_idx: + #record it as a mention if: + # 1) it is not at the end of the sentence + # 2) the phrase is not followed by a strat_flag + # (this is to avoid duplication) + # 3) the mention is not part of garbled table e.g. 
'Tumbiana Tumbiana Tumbiana Tumbiana'
+                    if i+len(name_part)<len(words) and words[i+len(name_part)] not in strat_flags and sentence.count(strat_phrase)<4:
+
+                        #initial assumption is that no age information accompanies the mention
+                        int_name='na'
+                        int_id='0'
+
+                        #look to see if there is an interval name before the mention
+                        if i>1 and words[i-1] in int_dict['name']:
+                            #record this interval name
+                            int_name=words[i-1]
+                            #list comprehensions to record interval id
+                            locations = [k for k, t in enumerate(int_dict['name']) if t==int_name]
+                            int_id = [int_dict['int_id'][I] for I in locations]
+                            int_id=int_id[0]
+
+                        #look to see if there is an age_flag before the mention
+                        elif i>1 and words[i-1] in age_flags:
+                            #record age flag with its preceding word (most likely a number)
+                            int_name = words[i-2] + ' ' + words[i-1]
+
+                        #record where mention is found
+                        max_word_id = str(i+len(name_part))
+                        min_word_id = str(i)
+
+                        #add to local variable
+                        strat_list.append('\t'.join(str(x) for x in [idx2, doc_id, sent_id,name.split(DICT_DELIM)[0], strat_phrase,strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence]))
+
+                        #write to PSQL table
+                        cursor.execute("""
+                            INSERT INTO strat_phrases(  docid,
+                                                        sentid,
+                                                        strat_phrase,
+                                                        strat_phrase_root,
+                                                        strat_flag,
+                                                        phrase_start,
+                                                        phrase_end,
+                                                        strat_name_id,
+                                                        int_name,
+                                                        int_id,
+                                                        sentence,
+                                                        age_agree)
+                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""",
+                            (doc_id, sent_id,name.split(DICT_DELIM)[0], strat_phrase, strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence, age_agree)
+                            )
+
+#push insertions to the database
+connection.commit()
+
+#VACUUM ANALYZE cannot run inside a transaction block, so switch to autocommit
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE strat_phrases;
+""")
+connection.commit()
+
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE target_instances;
+""")
+connection.commit()
+
+
+#summarize the number of DISTINCT strat_name_roots found in a given sentence
+cursor.execute(""" WITH query AS(SELECT docid, sentid,
+                        COUNT(DISTINCT strat_phrase_root) AS count
+                        FROM strat_phrases
+                        GROUP BY docid,sentid)
+
+                   UPDATE strat_phrases
+                        SET num_phrase = query.count
+                        FROM query
+                        WHERE strat_phrases.docid = query.docid
+                        AND strat_phrases.sentid = query.sentid
+
+""")
+connection.commit()
+
+#summarize the number of DISTINCT strat_name_roots found for a given document
+cursor.execute(""" WITH query AS(SELECT docid,
+                        COUNT(DISTINCT strat_phrase_root) AS count
+                        FROM strat_phrases
+                        GROUP BY docid)
+
+                   UPDATE target_instances
+                        SET num_strat_doc = query.count
+                        FROM query
+                        WHERE target_instances.docid = query.docid
+""")
+connection.commit()
+
+#close the postgres connection
+connection.close()
+
+#summary statistic
+success = 'number of stratigraphic mentions : %s' %len(strat_list)
+
+#summary of performance time
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
+
+#print out random result
+r=random.randint(0,len(strat_list)-1); show = "\n".join(str(x) for x in strat_list[r].split('\t')); print "=========================\n" + show + "\n========================="
diff --git a/udf/ext_strat_phrases.py b/udf/ext_strat_phrases.py
new file mode 100755
index 0000000..1257151
--- /dev/null
+++ b/udf/ext_strat_phrases.py
@@ -0,0 +1,336 @@
+#==============================================================================
+#STRATIGRAPHIC NAME EXTRACTOR
+#   ENTITIES = CAPITALIZED WORDS PRECEDING A STRATIGRAPHIC FLAG
+#   MENTIONS = DEFINED ENTITIES MINUS THE STRATIGRAPHIC FLAG
+#
+#   ENTITY MAPPING DONE ON THE FULL SENTENCES TABLE
+#   MENTIONS DEFINED BY ENTITIES FOUND IN A GIVEN DOCUMENT
+#   MENTION MAPPING DONE ON SENTENCES WITH A TARGET INSTANCE
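+#
+#   EXAMPLE: IF 'BITTER SPRINGS FORMATION' IS FOUND AS AN ENTITY IN A DOCUMENT,
+#   A LATER BARE 'BITTER SPRINGS' IN THAT DOCUMENT IS RECORDED AS A MENTION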
+#============================================================================== + +#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf + +#============================================================================== +# ACQUIRE RELEVANT MODULES +#============================================================================== +import time, urllib2, csv, random, psycopg2, re, string, yaml +from stop_words import get_stop_words +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +#============================================================================== +# DEFINE FUNCTION TO DOWNLOAD CSV +#============================================================================== +def download_csv( url ): + + #return variable + dump_dict = {} + + #get strat_names from Macrostrat API + dump = urllib2.urlopen( url ) + dump = csv.reader(dump) + + #unpack downloaded CSV as list of tuples + #--> length of VARIABLE == number of fields + #--> length of VARIABLE[i] == number of rows + #--> VARIABLE[i][0] = header name + cols = list(zip(*dump)) + + #key names correspond to field names (headers in the CSV file) + for field in cols: + dump_dict[field[0]]=field[1:] + + dump_dict['headers'] = sorted(dump_dict.keys()) + + return dump_dict + +#============================================================================== +# CONNECT TO POSTGRES +#============================================================================== +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + +#initalize the strat_phrases table +cursor.execute(""" + DELETE FROM strat_phrases; +""") + +#IMPORT THE SENTENCES DUMP +cursor.execute(""" + SELECT docid, sentid, words FROM %(my_app)s_sentences_%(my_product)s; +""", { + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()) +}) +#sentences=cursor.fetchall() + +#convert list of tuples to list of lists +#sentences = [list(elem) for elem in sentences] + +#push drop/create to the database +connection.commit() + +#============================================================================== +# DEFINE STRATIGRPAHIC VARIABLES +#============================================================================== + +#get strat_names from Macrostrat API +strat_dict = download_csv( 'https://macrostrat.org/api/defs/strat_names?all&format=csv' ) + +#get interval_names from Macrostrat API +int_dict = download_csv( 'https://macrostrat.org/api/defs/intervals?all&format=csv' ) + +#stop words +stop_words = get_stop_words('english') +stop_words = [i.encode('ascii','ignore') for i in stop_words] +alpha = list(string.ascii_lowercase); +alpha_period = [i+'.' 
for i in alpha] +stop_words = stop_words + ['lower','upper','research'] + alpha + alpha_period + +#STRATIGRAPHIC VARIABLE DEFINITIONS +with open('./var/strat_variables.txt') as fid: + strat_variables = fid.readlines() + +for i in strat_variables: + exec i + +#============================================================================== +# LOOK FOR STRATIGRAPHIC NOMENCLATURE - ENTITY RECOGNITION +#============================================================================== + +#PRE-PROCESS: hack to replace weird strings +changed_docs=[]; + +#initialize the list of found names and list of documents +strat_list=[] +doc_list={} +to_write = [] + +#loop through sentences +for idx,line in enumerate(cursor): + line = list(line) + for ws in weird_strings: + if ws[0] in ' '.join(line[2]): + changed_docs.append([line[0], line[1], ws[0], ws[1]]) + line[2]=[word.replace(ws[0],ws[1]) for word in line[2]] + line = tuple(line) + + #collect individual elements from the psql sentences dump + doc_id, sent_id, words = line + + #initialize the variables needed to analyze words in sentence + i = 0 + complete_phrase = [] + + for word in words: + i += 1 + + #initial assumption is a found strat name will have no age information and no link to Macrostrat + int_name="na" + int_id='0' + strat_name_id = '0' + + #initialize the lists of word indices and stratigraphic phrase words + indices=[] + strat_phrase = [] + + #logic triggered by discovery of 'stratigraphic' flag (i.e. Formation, etc.) + if word in strat_flags: + #record the found word and its index + indices.append(i) + this_word = words[i-1] + + #initialize variables needed for analysis of preceding words + preceding_words=[] + j = 2 + + #loop to identify preceding stratigraphic modifiers on GOOD_WORD (e.g. Wonoka Formation) + #loop continues if: + # 1) the beginning of sentence is not reached + # 2) the preceding string is not empty + # 3) the preceding word is not the current word + # 4) the preceding word is capitalized + # 5) the preceding capitalized word is not a stratigraphic flag (e.g. Member Wonoka Formation) + # 6) the preceding word is not a capitalized stop word + # 7) the preceding word does not contain a number + while (i-j)>(-1) and len(words[i-j])!=0 and words[i-j] != words[i-j+1] and words[i-j][0].isupper() and words[i-j] not in strat_flags and words[i-j].lower() not in stop_words and re.findall(r'\d+', words[i-j])==[]: + #loop also broken if preceding word is an interval name (e.g. Ediacaran Wonoka Formation) + if words[i-j] in int_dict['name']: + #record this interval name + int_name=words[i-j] + + #list comprehensions to record interval id + locations = [k for k, t in enumerate(int_dict['name']) if t==int_name] + int_id = [int_dict['int_id'][I] for I in locations] + int_id=int_id[0] + break + + #loop also broken if preceding word is an age flag (i.e. 580 Ma. Wonoka Formation) + elif words[i-j] in age_flags: + #record age flag with its preceding word (most likely a number) + int_name = words[i-j-1] + ' ' + words[i-j] + break + + #record qualifying preceding words and their indices + preceding_words.append(words[i-j]) + indices.append((i-j)) + j += 1 + + #if qualifying preceding words found, join them to the stratigraphic flag and create a stratigraphic phrase + if preceding_words and len(preceding_words)<4: + #create a full and partial stratigraphic phrase (i.e. 
with and without the stratigraphic flag) + preceding_words.reverse() + strat_phrase = ' '.join(preceding_words) + ' ' + this_word + strat_phrase_cut = ' '.join(preceding_words) + strat_flag=this_word + + #define term to check against Macrostrat's definitions + # i.e. Bitter Springs for Bitter Springs Formation + # Manlius Limestone for Manlius Limestone + if strat_flag in lith_flags: + strat_phrase_check = strat_phrase + else: + strat_phrase_check = strat_phrase_cut + + #index stratigraphic name to Macrostrat (if present) + if strat_phrase_check in strat_dict['strat_name']: + #list comprehensions to record strat name id (all string matches regardless of inferred rank) + locations = [k for k, t in enumerate(strat_dict['strat_name']) if t==strat_phrase_check] + loc_ids = [strat_dict['strat_name_id'][L] for L in locations] + if loc_ids: + strat_name_id = '~'.join(str(e) for e in loc_ids) + + #beginning and end of stratigraphic phrase + max_word_id = max(indices) + min_word_id = min(indices) + + #create list of stratigraphic phrases found in a given sentence + complete_phrase.append((idx, strat_phrase, strat_phrase_cut,strat_flag, doc_id, sent_id, max_word_id, min_word_id, strat_name_id,int_name,int_id, ' '.join(words))) + + #once sentence has been mined, add finds to growing list of stratigraphic names + for idx,strat_phrase,strat_phrase_cut,strat_flag, doc_id, sent_id, max_word_id, min_word_id, strat_name_id,int_name,int_id, sentence in complete_phrase: + + #dump to local variable + strat_list.append('\t'.join([str(x) for x in [idx, doc_id, sent_id, strat_phrase,strat_phrase_cut, strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence]])) + + #make dictionary of (strat name, strat_name_id), separated by user defined delimiet, per doc id + if doc_id in doc_list.keys(): + doc_list[doc_id].add(strat_phrase+DICT_DELIM+strat_name_id) + else: + doc_list[doc_id]=set([strat_phrase+DICT_DELIM+strat_name_id]) + + to_write.append((doc_id, sent_id, strat_phrase,strat_phrase_cut, strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence)) + +#write to PSQL table +cursor.executemany(""" + INSERT INTO strat_phrases( docid, + sentid, + strat_phrase, + strat_phrase_root, + strat_flag, + phrase_start, + phrase_end, + strat_name_id, + int_name, + int_id, + sentence) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""", to_write) + + +#push insertions +connection.commit() + +#some sort of magic +connection.set_isolation_level(0) +cursor.execute(""" VACUUM ANALYZE strat_phrases; +""") +connection.commit() + +#initalize the strat_dict table +cursor.execute(""" + DELETE FROM strat_dict; +""") + +#write stratigraphic names found in documents to a PSQL table +for idx1,doc in enumerate(doc_list.keys()): + strat_doc = list(doc_list[doc]) + cursor.execute(""" + INSERT INTO strat_dict( docid, + strat_phrase) + VALUES (%s, %s);""", + (doc, strat_doc) + ) + +connection.commit() + +#some sort of magic +connection.set_isolation_level(0) +cursor.execute(""" VACUUM ANALYZE strat_dict; +""") +connection.commit() + +#close the postgres connection +connection.close() + +#summary statistic +success = 'number of stratigraphic entities : %s' %len(strat_list) + +#summary of performance time +elapsed_time = time.time() - start_time +print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time) + +#print out random result +r=random.randint(0,len(strat_list)-1); show = "\n".join(str(x) for x in strat_list[r].split('\t')); print 
"=========================\n" + show + "\n=========================" + + +#%% OLD CODE +##IMPORT SENTENCES TO MINE +#fid = open('/Users/jhusson/local/bin/deepdive-0.7.1/app/stromatolites/tutorial/input/strat_locations.tsv','r') +#test = fid.readlines() +#fid.close() + +##SPLIT LINE INTO TAB SEPARATED COMPONENTS +#elem = line.split('\t') + +##WRITE DATA TO A FILE +#fid = open('/Users/jhusson/local/bin/deepdive-0.7.1/app/stromatolites/tutorial/input/strat_phrases.tsv','w') +#for item in strat_list: +# fid.write("%s\n" % item) +#fid.close() + +##USEFUL BIT OF CODE FOR LOOKING AT RANDOM SENTENCES +#r=random.randint(0,len(strat_locations)); elem=strat_locations[r].split('\t'); elem[4].replace("~^~"," ") + +##USEFUL BIT OF CODE FOR LOOKING AT RANDOM RESULTS +#r=random.randint(0,len(strat_list)-1); show = "\n".join(str(x) for x in strat_list[r].split('\t')); show=show.replace(ARR_DELIM,' '); print "=========================\n" + show + "\n=========================" + + +##USEFUL BIT OF CODE FOR LOOKING AT ALL RESULTS +#for item in strat_list: +# show = "\n".join(str(x) for x in item.split('\t')) +# print "=========================\n" + show + "\n=========================" +# +#cursor.execute(""" SELECT * from sentences where doc_id='54b43272e138239d8685117b' and sent_id=352 """) +#dump=cursor.fetchall() +# +#cursor.execute(""" SELECT * from sentences where doc_id='54b43289e138239d868552b2' and sent_id=421 """) +#dump=cursor.fetchall() + + + + + diff --git a/udf/ext_strat_target.py b/udf/ext_strat_target.py new file mode 100755 index 0000000..7a9a12e --- /dev/null +++ b/udf/ext_strat_target.py @@ -0,0 +1,269 @@ +#============================================================================== +#DEFINE RELATIONSHIP BETWEEN TARGET ENTITIES AND STRATIGRAPHIC PHRASES +#============================================================================== + +#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf + +#============================================================================== +# ACQUIRE RELEVANT MODULES and DATA +#============================================================================== + +import time, random, psycopg2, yaml +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +# Connect to Postgres +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + +#initalize the strat_target relationship table +cursor.execute(""" + DELETE FROM strat_target; +""") +connection.commit() + +#strat_phrases data dump +cursor.execute(""" + SELECT DISTINCT ON (strat_phrases.docid, + strat_phrases.sentid, + strat_phrase, + phrase_start, + phrase_end) + + strat_phrases.docid, + strat_phrases.sentid, + strat_phrase_root, + strat_flag, + strat_name_id, + phrase_start, + phrase_end, + int_name, + num_phrase, + strat_phrases.sentence, + strat_phrases.age_agree + + FROM strat_phrases, target_instances + WHERE strat_phrases.docid=target_instances.docid + AND strat_phrases.sentid=target_instances.sentid +""") + + +#convert list of tuples to list of lists +strat_list=cursor.fetchall() +strat_list = [list(elem) for elem in strat_list] + +#target_instances 
data dump +cursor.execute(""" + SELECT target_instances.docid, + target_instances.sentid, + target_word, + target_word_idx, + target_pose, + target_path, + target_parent, + target_children, + %(my_app)s_sentences_%(my_product)s.words, + target_id + FROM target_instances, %(my_app)s_sentences_%(my_product)s + WHERE target_instances.docid=%(my_app)s_sentences_%(my_product)s.docid + AND target_instances.sentid=%(my_app)s_sentences_%(my_product)s.sentid;""", + { + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()) +}) + +#convert list of tuples to list of lists +target_instances=cursor.fetchall() +target_instances = [list(elem) for elem in target_instances] + +#============================================================================== +# DEFINING RELATIONSHIP BETWEEN STRATIGRAPHY ENTITY/MENTION AND TARGET +#============================================================================== + +strat_target_list=[] + +#loop through all sentences with strat entities/mentions +for idx, line in enumerate(strat_list): + doc_id, sent_id, strat_phrase_root, strat_flag,strat_name_id,phrase_start,phrase_end,int_name,num_phrase,sentence,age_agree = line + + #grab the target instances for that same sentence + target=[s for k, s in enumerate(target_instances) if s[0]==doc_id and s[1]==sent_id] + + #loop through all target instances in that sentence + for idx2,elem in enumerate(target): + doc_id, sent_id, target_word,target_word_idx,target_pose,target_path,target_parent,target_children,words, target_id = elem + + #is the stratigraphic entity/mention a PARENT or CHILD of the target instance? + if list(set(target_parent) & set(range(phrase_start,phrase_end)))!=[]: + target_relation='parent' + elif list(set(sum(eval(target_children), [])) & set(range(phrase_start,phrase_end)))!=[]: + target_relation='child' + else: + target_relation='na' + + #what is the word DISTANCE between the strat mention/entity and the target instance? + target_distance=[max(target_word_idx)-i for i in range(phrase_start,phrase_end)] + target_distance=target_distance+[min(target_word_idx)-i for i in range(phrase_start,phrase_end)] + + # target found WITHIN the strat phrase (e.g. 
Upper Stromatolitic Carbonate Member) + if sum(n > 0 for n in target_distance)!=0 and sum(n < 0 for n in target_distance)!=0: + target_distance=0 + #target found BEHIND the strat phrase + elif sum(n > 0 for n in target_distance)==0: + target_distance = max(target_distance) + #target found AHEAD of the strat_phrase + else: + target_distance = min(target_distance) + + #grab the bag of words + if target_distance>1: + words_between = words[phrase_end:phrase_end+(target_distance)] + elif target_distance<-1: + words_between = words[phrase_start+(target_distance):phrase_start] + else: + words_between='{}' + + #dump to local variable + strat_target_list.append([doc_id, sent_id, strat_phrase_root,num_phrase, + target_relation,target_distance,sentence, + strat_flag,phrase_start,phrase_end,int_name, + words_between,target_word,target_word_idx]) + #write to PSQL table + cursor.execute(""" + INSERT INTO strat_target( docid, + sentid, + target_word, + target_word_idx, + strat_phrase_root, + strat_flag, + strat_name_id, + strat_start, + strat_end, + int_name, + num_phrase, + target_relation, + target_distance, + words_between, + sentence, + age_agree, + target_id) + VALUES (%s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, %s, %s);""", + + (doc_id, sent_id, target_word, + target_word_idx, strat_phrase_root, strat_flag, + strat_name_id,phrase_start,phrase_end, + int_name,num_phrase,target_relation, + target_distance,words_between,sentence,age_agree, target_id) + ) + +connection.commit() + +#some sort of magic +connection.set_isolation_level(0) +cursor.execute(""" VACUUM ANALYZE strat_target; +""") +connection.commit() + +#============================================================================== +# PROVIDE SUMMARIES FOR AGE-AGREEMENT BETWEEN STRAT_PHRASE AND MACROSTRAT STRAT_NAME +#============================================================================== + +#initialize the age_agree column in strat_phrases +cursor.execute(""" + UPDATE strat_target + SET age_sum = '-'; +""") +connection.commit() + +#gather distinct Macrostrat links +cursor.execute(""" + SELECT DISTINCT (strat_name_id) FROM strat_target; +""") + +#convert list of tuples to list of lists +tocheck=cursor.fetchall() +tocheck = [list(elem) for elem in tocheck] + +#find all instances of strat_name_id occuring in the age_check table +cursor.execute(""" + WITH query AS(SELECT DISTINCT (strat_name_id) FROM strat_target) + + SELECT strat_phrases.strat_name_id, strat_phrases.age_agree FROM strat_phrases,query + WHERE strat_phrases.strat_name_id=query.strat_name_id + AND strat_phrases.age_agree<>'-'; + """, +) + +#convert list of tuples to list of lists +results=cursor.fetchall() +results = [list(elem) for elem in results] + +#loop through all strat_name_ids and summarize age agreement discoveries +for idx,name in enumerate(tocheck): + tmp = [i for i in results if i[0]==name[0]] + ids = name[0].split('~') + + #initialize the age agreement list + counts = [[0] * 2 for i in range(len(ids))] + + #loop through all comparisons between a strat_name_id string and interval information + for idx2,item in enumerate(tmp): + #consider each strat_name in the strat_name_string + ans = item[1].split('~') + + #record whether its an allowable or disallowable match + for idx3,data in enumerate(ans): + if data=='yes': + counts[idx3][0]+=1 + elif data=='no': + counts[idx3][1]+=1 + + #record the age agreement summary + tocheck[idx].extend([counts]) + + #variables to push to PSQL database + strat_name_id=name[0] + str_counts=str(counts) + + #write to 
diff --git a/udf/ext_strat_target_distant.py b/udf/ext_strat_target_distant.py
new file mode 100755
index 0000000..0093159
--- /dev/null
+++ b/udf/ext_strat_target_distant.py
@@ -0,0 +1,326 @@
+#==============================================================================
+#DEFINE RELATIONSHIP BETWEEN TARGET ENTITIES AND DISTANT STRATIGRAPHIC PHRASES
+#==============================================================================
+
+#==============================================================================
+# ACQUIRE RELEVANT MODULES and DATA
+#==============================================================================
+
+import time, random, psycopg2, yaml
+from psycopg2.extensions import AsIs
+
+#tic
+start_time = time.time()
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+
+cursor = connection.cursor()
+
+doc_cursor=connection.cursor()
+target_cursor=connection.cursor()
+strat_cursor = connection.cursor()
+sent_cursor = connection.cursor()
+
+#VACUUM ANALYZE cannot run inside a transaction, so switch psycopg2 to
+#autocommit (isolation level 0) before refreshing the planner statistics
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE target_instances;
+""")
+connection.commit()
+
+#refresh planner statistics for strat_phrases
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE strat_phrases;
+""")
+connection.commit()
+
+#refresh planner statistics for the sentences table
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE %(my_app)s_sentences_%(my_product)s;
+""", {
+    "my_app": AsIs(config['app_name']),
+    "my_product": AsIs(config['product'].lower())
+})
+connection.commit()
+
+#==============================================================================
+# FIND STRATIGRAPHIC PHRASES NEAREST TO ORPHAN TARGET INSTANCES
+#==============================================================================
+
+#how many sentences back from an orphan to look for stratigraphic phrases
+strat_distance=3
+
+#initialize the dump variable
+strat_target_distant=[]
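+
+#EXAMPLE: for an orphan at sentid 41 with strat_phrases at sentids 36, 39 and
+#40, only 39 and 40 fall within the three-sentence window above, and the
+#closest preceding sentence (40) supplies the linked strat_phrase(s)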
+
+#list of docids with orphaned targets
+doc_cursor.execute("""
+    SELECT DISTINCT ON (target_instances.docid)
+
+        target_instances.docid
+    FROM target_instances, %(my_app)s_sentences_%(my_product)s
+    WHERE target_instances.target_id
+        NOT IN (select strat_target.target_id from strat_target)
+        AND num_strat_doc<>0
+        AND target_instances.docid=%(my_app)s_sentences_%(my_product)s.docid
+        AND target_instances.sentid=%(my_app)s_sentences_%(my_product)s.sentid
+    ORDER BY target_instances.docid ASC, target_instances.sentid ASC
+""", {
+    "my_app": AsIs(config['app_name']),
+    "my_product": AsIs(config['product'].lower())
+})
+
+
+#initialize the strat_target_distant relationship table
+cursor.execute("""
+    DELETE FROM strat_target_distant;
+""")
+connection.commit()
+
+#loop through document list
+for idx,doc in enumerate(doc_cursor):
+    #orphaned targets from a given document
+    target_cursor.execute("""
+        SELECT DISTINCT ON (target_instances.docid,
+                            target_instances.sentid,
+                            target_instances.target_word_idx)
+
+            target_instances.docid,
+            target_instances.sentid,
+            target_word,
+            target_word_idx,
+            target_parent,
+            target_children,
+            %(my_app)s_sentences_%(my_product)s.words,
+            target_id
+        FROM target_instances, %(my_app)s_sentences_%(my_product)s
+        WHERE target_instances.target_id
+            NOT IN (select strat_target.target_id from strat_target)
+            AND target_instances.docid=%(my_docid)s
+            AND target_instances.docid=%(my_app)s_sentences_%(my_product)s.docid
+            AND target_instances.sentid=%(my_app)s_sentences_%(my_product)s.sentid
+        ORDER BY target_instances.docid ASC, target_instances.sentid ASC
+    """, {
+        "my_app": AsIs(config['app_name']),
+        "my_product": AsIs(config['product'].lower()),
+        "my_docid": doc[0]
+    })
+
+    #convert list of tuples to list of lists
+    tmp_target=target_cursor.fetchall()
+    tmp_target = [list(elem) for elem in tmp_target]
+
+    #define the sentences where those instances come from
+    sentids = [item[1] for item in tmp_target]
+
+    #gather all stratigraphic phrases from the docid that occur before the deepest orphan
+    sent_query = max(sentids)
+
+    #strat_phrases from the document that precede the orphan deepest into the document
+    strat_cursor.execute("""
+        SELECT DISTINCT ON (docid, sentid, strat_phrase_root,strat_name_id)
+            docid, sentid, strat_phrase_root, strat_flag, num_phrase, strat_name_id,int_name,age_agree from strat_phrases
+        WHERE docid=%s
+        AND sentid<%s
+        ORDER BY sentid ASC;""",
+        (doc[0], sent_query)
+    )
+
+    #convert list of tuples to list of lists
+    tmp_strat=strat_cursor.fetchall()
+    tmp_strat = [list(elem) for elem in tmp_strat]
+
+    #loop through the list of orphans
+    for idx2,target in enumerate(tmp_target):
+        #define set of variables from this particular orphan
+        target_sent=target[1]
+        target_word=target[2]
+        parent = target[4]
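+        #target_children is stored as a stringified list of lists (one list of
+        #child indices per word in the target); eval() restores it and
+        #sum(list_of_lists, []) flattens it, e.g. sum([[3], [7, 9]], []) -> [3, 7, 9]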
+        children = list(sum(eval(target[5]), []))
+        words = target[6]
+        target_id=target[7]
+
+        #find all stratigraphic phrases that occur before this orphan and within the defined buffer
+        strat_find = [item[1] for item in tmp_strat if target_sent-item[1]<=strat_distance and target_sent-item[1]>0]
+
+        #if candidate strat_phrase(s) are found
+        if strat_find:
+            #select the closest sentence with phrase(s)
+            strat_find=max(strat_find)
+            #collect all the strat_phrase(s) in that sentence
+            strat_info = [item for item in tmp_strat if item[1]==strat_find]
+
+            #define the sentids for sentences that bridge the strat_phrase(s) to the orphan
+            sent_inbetween=range(strat_find,target[1]+1)
+
+            #collect the words between strat_phrases and orphaned target
+            sent_cursor.execute("""
+                SELECT docid, sentid, words from %(my_app)s_sentences_%(my_product)s
+                WHERE docid=%(my_docid)s
+                AND sentid=ANY(%(my_sentid)s)
+                ORDER BY sentid ASC;""",
+                {
+                    "my_app": AsIs(config['app_name']),
+                    "my_product": AsIs(config['product'].lower()),
+                    "my_docid": doc[0],
+                    "my_sentid": sent_inbetween
+                }
+            )
+
+            #convert list of tuples to list of lists
+            words_between=sent_cursor.fetchall()
+            words_between = [list(elem) for elem in words_between]
+            words_between = [' '.join(item[2]) for item in words_between]
+            words_between = ''.join(words_between)
+
+            #define the distance between orphan and strat_phrase(s) sentence
+            target_distance = target[1]-strat_find
+
+            #define grammatical parent and children (as words) of the orphan
+            parent = [words[i] for i in parent]
+            children = [words[i] for i in children]
+
+            #loop through all the strat_phrases found in the nearest host sentence
+            for match in strat_info:
+                #info about the strat_phrase
+                [docid, sentid, strat_phrase_root,
+                 strat_flag, num_phrase, strat_name_id,
+                 int_name, age_agree] = match
+
+                toadd=[docid, sentid, strat_phrase_root,
+                       strat_flag, num_phrase, strat_name_id,
+                       int_name, age_agree, target_distance,
+                       target_id,target_word,parent,children,
+                       words_between]
+
+                #dump to local variable
+                strat_target_distant.append(toadd)
+
+                #write to PSQL table
+                cursor.execute("""
+                    INSERT INTO strat_target_distant(   docid,
+                                                        sentid,
+                                                        strat_phrase_root,
+                                                        strat_flag,
+                                                        num_phrase,
+                                                        strat_name_id,
+                                                        int_name,
+                                                        age_agree,
+                                                        target_sent_dist,
+                                                        target_id,
+                                                        target_word,
+                                                        target_parent,
+                                                        target_children,
+                                                        words_between)
+                    VALUES (%s, %s, %s, %s, %s,
+                            %s, %s, %s, %s, %s,
+                            %s, %s, %s, %s);""",
+
+                    (docid, sentid, strat_phrase_root,
+                     strat_flag, num_phrase, strat_name_id,
+                     int_name, age_agree, target_distance,
+                     target_id,target_word,parent,children,
+                     words_between)
+                )
+
+#push the insertions
+connection.commit()
+
+
+#==============================================================================
+# PROVIDE SUMMARIES FOR AGE-AGREEMENT BETWEEN STRAT_PHRASE AND MACROSTRAT STRAT_NAME
+#==============================================================================
+
+#initialize the age_sum column in strat_target_distant
+cursor.execute("""
+    UPDATE strat_target_distant
+    SET age_sum = '-';
+""")
+connection.commit()
+
+#gather distinct Macrostrat links
+cursor.execute("""
+    SELECT DISTINCT (strat_name_id) FROM strat_target_distant;
+""")
+
+#convert list of tuples to list of lists
+tocheck=cursor.fetchall()
+tocheck = [list(elem) for elem in tocheck]
+
+#find all age-agreement calls already recorded for those strat_name_ids in strat_phrases
+cursor.execute("""
+    WITH query AS(SELECT DISTINCT (strat_name_id) FROM strat_target_distant)
+
+    SELECT strat_phrases.strat_name_id, strat_phrases.age_agree FROM strat_phrases,query
+    WHERE strat_phrases.strat_name_id=query.strat_name_id
+    AND strat_phrases.age_agree<>'-';
+    """,
+)
+
+#convert list of tuples to list of lists
+results=cursor.fetchall()
+results = [list(elem) for elem in results]
+
+#loop through all strat_name_ids and summarize age agreement discoveries
+for idx,name in enumerate(tocheck):
+    tmp = [i for i in results if i[0]==name[0]]
+    ids = name[0].split('~')
+
+    #initialize the age agreement list
+    counts = [[0] * 2 for i in range(len(ids))]
+
+    #loop through all comparisons between a strat_name_id string and interval information
+    for idx2,item in enumerate(tmp):
+        #consider each strat_name in the strat_name_string
+        ans = item[1].split('~')
+
+        #record whether it's an allowable or disallowable match
+        for idx3,data in enumerate(ans):
+            if data=='yes':
+                counts[idx3][0]+=1
+            elif data=='no':
+                counts[idx3][1]+=1
+
+    #record the age agreement summary
+    tocheck[idx].extend([counts])
+
+    #variables to push to PSQL database
+    strat_name_id=name[0]
+    str_counts=str(counts)
+
+    #write to PSQL table
+    cursor.execute("""
+        UPDATE strat_target_distant
+        SET age_sum = %s
+        WHERE strat_name_id = %s;""",
+
+        (str_counts, strat_name_id)
+    )
+
+connection.commit()
+
+
+#summary statistic
+success = 'number of strat-distant target tuples : %s' %len(strat_target_distant)
+
+#toc
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
+
+
+#show a random result
+r=random.randint(0,len(strat_target_distant)-1)
+show = "\n".join(str(x) for x in strat_target_distant[r])
+print "=========================\n" + show + "\n========================="
+
+#close the postgres connection
+connection.close()
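
Once populated, `strat_target_distant` can be spot-checked with an ordinary query; a minimal sketch, assuming it runs while a psycopg2 connection and cursor are still open (the column names come from `udf/initdb.py`):

````
# illustrative spot-check; assumes an open psycopg2 connection and cursor
cursor.execute("""
    SELECT docid, sentid, strat_phrase_root, target_word,
           target_sent_dist, age_sum
    FROM strat_target_distant
    ORDER BY docid, sentid;
""")
for row in cursor.fetchall():
    print row
````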
diff --git a/udf/ext_target.py b/udf/ext_target.py
new file mode 100755
index 0000000..152c7c2
--- /dev/null
+++ b/udf/ext_target.py
@@ -0,0 +1,166 @@
+#==============================================================================
+#TARGET NAME EXTRACTOR
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf
+
+#==============================================================================
+# import relevant modules and data
+#==============================================================================
+import time, random, re, yaml, psycopg2
+from psycopg2.extensions import AsIs
+
+start_time = time.time()
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+cursor = connection.cursor()
+
+#initialize the target_instances table
+cursor.execute("""
+    DELETE FROM target_instances;
+""")
+
+#IMPORT THE SENTENCES DUMP
+cursor.execute("""
+    SELECT docid, sentid, words, poses, dep_paths, dep_parents FROM %(my_app)s_sentences_%(my_product)s;
+""", {
+    "my_app": AsIs(config['app_name']),
+    "my_product": AsIs(config['product'].lower())
+})
+
+#push the delete to the database
+connection.commit()
+
+
+#initialize list of target occurrences
+target_list=[]
+
+#TARGET DEFINITIONS
+with open('./var/target_variables.txt') as fid:
+    target_variables = fid.readlines()
+
+for i in target_variables:
+    exec i
+
+#loop through all sentences
+to_write = []
+for line in cursor:
+    #collect individual elements from the psql sentences dump
+    docid, sentid, words, poses, dep_paths, dep_parents = line
+
+    #initialize list of local target occurrences
+    targets = []
+
+    #sentence string
+    sent = ' '.join(words)
+
+    #loop through all the target names
+    for name in target_names:
+        #starting index of all matches for a target_name in the joined sentence
+        matches=[m.start() for m in re.finditer(name,sent.lower())]
+
+        if matches:
+            #if at least one match is found, count the spaces backward to arrive at a word index
+            indices = [sent[0:m].count(' ') for m in matches]
+            #remove double hits (e.g. stromatolitic-thrombolitic)
+            indices = list(set(indices))
+            #a target_name spans its starting word index to the number of words in the phrase
+            target_word_idx = [[i,i+len(name.split(' '))] for i in indices]
+
+            #initialize other data about a found target_name
+            target_pose=[]
+            target_path=[]
+            target_parent=[]
+
+            for span in target_word_idx:
+                #poses, paths and parents can be found at the same indices as a target_name find
+                target_word = ' '.join(words[span[0]:span[1]])
+
+                if target_word.lower() not in bad_words:
+                    target_children=[]
+                    target_pose = poses[span[0]:span[1]]
+                    target_path = dep_paths[span[0]:span[1]]
+                    target_parent = dep_parents[span[0]:span[1]]
+
+                    #children of each component of a target_name
+                    for span_idx in range(span[0], span[1]):
+                        children = [j for j,i in enumerate(dep_parents) if i==span_idx+1]
+                        target_children.append(children)
+
+                    #convert parent ids to Pythonic (0-based) ids
+                    target_parent = [i-1 for i in target_parent]
+
+                    #add finds to a local variable
+                    target_list.append([docid, sentid, target_word, span, target_pose, target_path, target_parent, target_children, sent])
+
+                    #for easier storage, convert the list of target_children lists to a string
+                    str_target_children = str(target_children)
+
+                    #queue the row for the PSQL table
+                    to_write.append(
+                        (docid, sentid, target_word, span, target_pose, target_path, target_parent, str_target_children, sent)
+                    )
+
+cursor.executemany("""
+    INSERT INTO target_instances(   docid,
+                                    sentid,
+                                    target_word,
+                                    target_word_idx,
+                                    target_pose,
+                                    target_path,
+                                    target_parent,
+                                    target_children,
+                                    sentence)
+    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);""",
+    to_write
+)
+
+#push insertions to the database
+connection.commit()
+
+#restart the primary key: drop it...
+cursor.execute("""
+    ALTER TABLE target_instances DROP target_id;
+""")
+
+#push the drop to the database
+connection.commit()
+
+#...and add it back so target_ids are sequential
+cursor.execute(""" ALTER TABLE target_instances ADD COLUMN target_id SERIAL PRIMARY KEY;
+""")
+connection.commit()
+
+
+#VACUUM ANALYZE cannot run inside a transaction, so switch psycopg2 to
+#autocommit (isolation level 0) before refreshing the planner statistics
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE target_instances;
+""")
+connection.commit()
+
+#close the connection
+connection.close()
+
+#summary statistic
+success = 'number of target instances: %s' %len(target_list)
+
+#summary of performance time
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
+
+
+#USEFUL BIT OF CODE FOR LOOKING AT RANDOM RESULTS
+r=random.randint(0,len(target_list)-1)
+print "=========================\n"
+print("\n".join(str(target) for target in target_list[r]))
+print "\n========================="
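
The index arithmetic in `ext_target.py` — character offsets from `re.finditer` converted to word indices by counting the spaces to their left — is easiest to see on a toy sentence. A minimal sketch (the sentence is hypothetical):

````
import re

# toy inputs; the real words come from the sentences table
words = ['Stromatolites', 'occur', 'in', 'the', 'Biwabik', 'Formation', '.']
sent = ' '.join(words)

# character offset of each match for one target name
matches = [m.start() for m in re.finditer('stromatol', sent.lower())]  # [0]

# spaces to the left of a character offset = index of the matched word
indices = [sent[0:m].count(' ') for m in matches]                      # [0]

# a one-word target spans [start, start + 1)
span = [indices[0], indices[0] + len('stromatol'.split(' '))]
print words[span[0]:span[1]]                                           # ['Stromatolites']
````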
diff --git a/udf/ext_target_adjective.py b/udf/ext_target_adjective.py
new file mode 100755
index 0000000..0133ddc
--- /dev/null
+++ b/udf/ext_target_adjective.py
@@ -0,0 +1,100 @@
+#==============================================================================
+#TARGET ADJECTIVE EXTRACTOR
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf
+
+#==============================================================================
+# import relevant modules and data
+#==============================================================================
+import time, random, re, yaml, psycopg2
+from psycopg2.extensions import AsIs
+
+start_time = time.time()
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+cursor = connection.cursor()
+
+#IMPORT TARGETS WITH DEPENDENTS
+cursor.execute("""
+    SELECT docid, sentid, target_id, target_word, target_children
+
+    FROM target_instances
+    WHERE target_children<>'[[]]';
+""")
+
+target=cursor.fetchall()
+
+
+#IMPORT THE SENTENCES DUMP
+cursor.execute("""
+    WITH temp as (
+        SELECT DISTINCT ON (docid, sentid) docid, sentid
+        FROM target_instances
+        WHERE target_children<>'[[]]'
+    )
+
+    SELECT s.docid, s.sentid, words, poses
+    FROM %(my_app)s_sentences_%(my_product)s AS s
+
+    JOIN temp ON temp.docid=s.docid AND temp.sentid=s.sentid;
+    """, {
+        "my_app": AsIs(config['app_name']),
+        "my_product": AsIs(config['product'].lower())
+    })
+
+sentences=cursor.fetchall()
+
+#initialize the target_adjectives table
+cursor.execute("""
+    DELETE FROM target_adjectives;
+""")
+
+#push the delete to the database
+connection.commit()
+
+
+adj=[]
+for idx,line in enumerate(target):
+    docid, sentid, target_id, target_word, target_children = line
+    target_children = eval(target_children)
+    target_children = target_children[0]
+
+    sent = [elem for elem in sentences if elem[0]==docid and elem[1]==sentid]
+
+    for c in target_children:
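+        #sent should be a one-element list, so sent[0][2] is the words array
+        #and sent[0][3] the poses array; 'JJ' is the Penn Treebank adjective tag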
+        if sent[0][3][c]=='JJ':
+            adj.append([docid, sentid, target_id, target_word, sent[0][2][c]])
+
+            #write to PSQL table
+            cursor.execute("""
+                INSERT INTO target_adjectives(  docid,
+                                                sentid,
+                                                target_id,
+                                                target_word,
+                                                target_adjective)
+                VALUES (%s, %s, %s, %s, %s);""",
+                (docid, sentid, target_id, target_word, sent[0][2][c])
+            )
+
+        #sanity check: child word indices should never be negative
+        if c<0:
+            print 'something is up!'
+
+#push insertions to the database
+connection.commit()
+
+#close the connection
+connection.close()
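
In miniature, the adjective harvest above is parallel-array indexing: a target's grammatical children are looked up in the sentence's `poses` array and kept when tagged `JJ`. A sketch with hypothetical arrays:

````
# hypothetical parallel arrays for one sentence
words = ['domal', 'stromatolites', 'are', 'abundant']
poses = ['JJ', 'NNS', 'VBP', 'JJ']

# hypothetical children of 'stromatolites' (0-based word indices)
target_children = [0]

# keep the children tagged as adjectives (Penn Treebank 'JJ')
adjectives = [words[c] for c in target_children if poses[c] == 'JJ']
print adjectives  # ['domal']
````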
diff --git a/udf/initdb.py b/udf/initdb.py
new file mode 100755
index 0000000..d723f1c
--- /dev/null
+++ b/udf/initdb.py
@@ -0,0 +1,194 @@
+#==============================================================================
+#INITIALIZE POSTGRES TABLES
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf
+
+#==============================================================================
+
+import yaml
+import psycopg2
+from psycopg2.extensions import AsIs
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+cursor = connection.cursor()
+
+#SENTENCES TABLE
+#DROP TABLE IF EXISTS sentences CASCADE;
+#CREATE TABLE sentences (docid text, sentid integer, wordidx integer[], words text[], poses text[], ners text[], lemmas text[], dep_paths text[], dep_parents integer[]);
+#COPY sentences FROM '/Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/input/strom_nlp352';
+
+
+#TARGET_INSTANCES
+cursor.execute("""
+    DROP TABLE IF EXISTS target_instances CASCADE;
+    CREATE TABLE target_instances(
+        target_id serial PRIMARY KEY,
+        docid text,
+        sentid int,
+        target_word text,
+        num_strat_doc int DEFAULT 0,
+        target_word_idx int[],
+        target_pose text[],
+        target_path text[],
+        target_parent int[],
+        target_children text,
+        sentence text);
+""")
+connection.commit()
+
+#TARGET_ADJECTIVES
+cursor.execute("""
+    DROP TABLE IF EXISTS target_adjectives CASCADE;
+    CREATE TABLE target_adjectives(
+        docid text,
+        sentid int,
+        target_id int,
+        target_word text,
+        target_adjective text);
+""")
+connection.commit()
+
+#STRAT_PHRASES
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_phrases CASCADE;
+    CREATE TABLE strat_phrases(
+        docid text,
+        sentid int,
+        strat_phrase text,
+        strat_phrase_root text,
+        num_phrase int,
+        sentence text,
+        strat_flag text,
+        phrase_start int,
+        phrase_end int,
+        strat_name_id text,
+        int_name text,
+        int_id int,
+        age_agree text DEFAULT '-');
+""")
+connection.commit()
+
+#STRAT_DICT
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_dict CASCADE;
+    CREATE TABLE strat_dict(
+        docid text,
+        strat_phrase text[]);
+""")
+connection.commit()
+
+
+#STRAT_TARGET
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_target CASCADE;
+    CREATE TABLE strat_target(
+        docid text,
+        sentid int,
+        refs_loc int,
+        in_ref text DEFAULT 'no',
+        strat_phrase_root text,
+        num_phrase int,
+        target_relation text,
+        target_distance int,
+        sentence text,
+        strat_flag text,
+        strat_name_id text,
+        strat_start int,
+        strat_end int,
+        int_name text,
+        age_agree text DEFAULT '-',
+        age_sum text DEFAULT '-',
+        words_between text[],
+        target_word text,
+        target_word_idx int[],
+        target_id int
+        );
+""")
+connection.commit()
+
+#AGE CHECK
+cursor.execute("""
+    DROP TABLE IF EXISTS age_check CASCADE;
+    CREATE TABLE age_check(
+        strat_phrase_root text,
+        strat_flag text,
+        strat_name_id text,
+        int_name text,
+        int_id int,
+        age_agree text);
+""")
+connection.commit()
+
+#STRAT_TARGET_DISTANT
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_target_distant CASCADE;
+    CREATE TABLE strat_target_distant(
+        docid text,
+        sentid int,
+        refs_loc int,
+        in_ref text DEFAULT 'no',
+        strat_phrase_root text,
+        strat_flag text,
+        num_phrase int,
+        int_name text,
+        strat_name_id text,
+        age_agree text DEFAULT '-',
+        age_sum text DEFAULT '-',
+        words_between text,
+        target_sent_dist int,
+        target_word text,
+        target_parent text[],
+        target_children text[],
+        target_id int
+        );
+""")
+connection.commit()
+
+
+#BIB
+cursor.execute("""
+    DROP TABLE IF EXISTS bib CASCADE;
+    CREATE TABLE bib(
+        docid text,
+        author text[],
+        title text,
+        journal text,
+        url text,
+        journal_instances int
+        );
+""")
+connection.commit()
+
+#RESULTS
+cursor.execute("""
+    DROP TABLE IF EXISTS results CASCADE;
+    CREATE TABLE results(
+        target_id int,
+        docid text,
+        sentid int,
+        target_word text,
+        strat_phrase_root text,
+        strat_name_id text,
+        age_sum text,
+        source text,
+        phrase text
+        );
+""")
+connection.commit()
+
+# Disconnect from Postgres
+connection.close()
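
The commented-out `sentences` schema above describes the NLP input every extractor consumes. A hypothetical row (selected fields only) illustrates the parallel-array layout and the 1-based `dep_parents` convention (0 marks the root) that `ext_target.py` converts to 0-based Python indices:

````
# hypothetical sentences row (selected fields); all values are illustrative
docid = 'doc0001'
sentid = 1
words = ['Stromatolites', 'are', 'abundant']
poses = ['NNS', 'VBP', 'JJ']
dep_paths = ['nsubj', 'cop', 'root']
dep_parents = [3, 3, 0]  # 1-based heads: words 1 and 2 hang off word 3; 0 = root

# the conversion used in ext_target.py
target_parent = [i - 1 for i in dep_parents]
print target_parent  # [2, 2, -1]
````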
diff --git a/var/strat_variables.txt b/var/strat_variables.txt
new file mode 100755
index 0000000..30994c2
--- /dev/null
+++ b/var/strat_variables.txt
@@ -0,0 +1,19 @@
+#==============================================================================
+# DEFINE STRATIGRAPHIC VARIABLES
+#==============================================================================
+
+#delimiter to separate strat_entities from strat_name_ids in strat_dict
+DICT_DELIM='$$$'
+
+#words indicating stratigraphic names
+strat_flags = ["Group", "Formation", "Member", "Supergroup", "Bed", "Subgroup","Gp.", "Fm.", "Mbr.", "SGp.", "Gp", "Fm", "Mbr", "SGp"]
+
+lith_flags = ["Dolomite","Dolostone","Limestone","Sandstone","Shale","Conglomerate","Chert"]
+
+strat_flags = strat_flags+lith_flags
+
+#words indicating an age
+age_flags = ["Ma.", "Ga.", "Myr.","Ma", "Ga", "Myr"]
+
+#list of known and troublesome ligatures
+weird_strings = [['\xef\xac\x82', 'fl'], ['\xef\xac\x81', 'fi']]
diff --git a/var/target_variables.txt b/var/target_variables.txt
new file mode 100755
index 0000000..82ed5fd
--- /dev/null
+++ b/var/target_variables.txt
@@ -0,0 +1,11 @@
+#==============================================================================
+# DEFINE TARGET VARIABLES
+#==============================================================================
+
+#each string in this list will define a regular expression search
+# EXAMPLE: [r'\b' + ooid + r'\b', r'\b' + ooids + r'\b']
+# will find all instances of 'ooid' or 'ooids' bound by a non-alphanumeric character
+target_names = ['stromatol', 'thrombol']
+
+#an optional list of false hits
+bad_words = ['non-stromatolitic','nonstromatolitic','non-stromatolite']
\ No newline at end of file
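
As the comments in `var/target_variables.txt` note, each `target_names` entry is used directly as a regular expression by `ext_target.py`. A minimal sketch of how the unanchored pattern 'stromatol' behaves (the sentence is hypothetical):

````
import re

target_names = ['stromatol', 'thrombol']
sent = 'Domal stromatolites overlie thrombolitic bioherms .'

for name in target_names:
    print name, [m.start() for m in re.finditer(name, sent.lower())]

# stromatol [6]
# thrombol [28]
````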