From 06c4508a9c6fd35c47cce34f87abe524e115fb72 Mon Sep 17 00:00:00 2001
From: jonhusson
Date: Fri, 16 Dec 2016 16:33:17 -0800
Subject: [PATCH] initial commit

---
 .gitignore                      |   4 +
 README.md                       |  91 +++++++++
 config                          |  17 ++
 credentials.example             |   6 +
 extractions/SQL.txt             |  31 +++
 input/url.txt                   |   1 +
 makefile                        |   8 +
 output/.gitignore               |   2 +
 requirements.txt                |   6 +
 run.py                          |  78 ++++++++
 setup/setup.sh                  |  53 +++++
 udf/buildbib.py                 | 104 ++++++++++
 udf/ext_age_check.py            | 254 ++++++++++++++++++++++++
 udf/ext_references.py           | 219 +++++++++++++++++++++
 udf/ext_results.py              | 170 ++++++++++++++++
 udf/ext_strat_mentions.py       | 264 +++++++++++++++++++++++++
 udf/ext_strat_phrases.py        | 336 ++++++++++++++++++++++++++++++++
 udf/ext_strat_target.py         | 269 +++++++++++++++++++++++++
 udf/ext_strat_target_distant.py | 326 +++++++++++++++++++++++++++++++
 udf/ext_target.py               | 166 ++++++++++++++++
 udf/ext_target_adjective.py     | 100 ++++++++++
 udf/initdb.py                   | 194 ++++++++++++++++++
 var/strat_variables.txt         |  19 ++
 var/target_variables.txt        |  11 ++
 24 files changed, 2729 insertions(+)
 create mode 100755 .gitignore
 create mode 100755 README.md
 create mode 100755 config
 create mode 100755 credentials.example
 create mode 100755 extractions/SQL.txt
 create mode 100755 input/url.txt
 create mode 100755 makefile
 create mode 100755 output/.gitignore
 create mode 100755 requirements.txt
 create mode 100755 run.py
 create mode 100755 setup/setup.sh
 create mode 100755 udf/buildbib.py
 create mode 100755 udf/ext_age_check.py
 create mode 100755 udf/ext_references.py
 create mode 100755 udf/ext_results.py
 create mode 100755 udf/ext_strat_mentions.py
 create mode 100755 udf/ext_strat_phrases.py
 create mode 100755 udf/ext_strat_target.py
 create mode 100755 udf/ext_strat_target_distant.py
 create mode 100755 udf/ext_target.py
 create mode 100755 udf/ext_target_adjective.py
 create mode 100755 udf/initdb.py
 create mode 100755 var/strat_variables.txt
 create mode 100755 var/target_variables.txt

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..2cbe8df
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.DS_Store
+
+credentials
+*.swp
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..96a8188
--- /dev/null
+++ b/README.md
@@ -0,0 +1,91 @@
+# GeoDeepDive Application Template
+A template for building applications for [GeoDeepDive](https://geodeepdive.org)
+
+## Getting started
+Dependencies:
+  + [GNU Make](https://www.gnu.org/software/make/)
+  + [git](https://git-scm.com/)
+  + [pip](https://pypi.python.org/pypi/pip)
+  + [PostgreSQL](http://www.postgresql.org/)
+
+### OS X
+OS X ships with GNU Make, `git`, and Python, but you will need to install `pip` and PostgreSQL.
+
+To install `pip`:
+````
+sudo easy_install pip
+````
+
+To install PostgreSQL, it is recommended that you use [Postgres.app](http://postgresapp.com/). Download
+the most recent version, and be sure to follow [the instructions](http://postgresapp.com/documentation/cli-tools.html)
+for setting up the command line tools, primarily adding the following line to your `~/.bash_profile`:
+
+````
+export PATH=$PATH:/Applications/Postgres.app/Contents/Versions/latest/bin
+````
+
+
+### Setting up the project
+First, clone this repository and run the setup script:
+
+````
+git clone https://github.com/UW-DeepDiveInfrastructure/app-template
+cd app-template
+make
+````
+
+Edit `credentials` with the connection credentials for your local Postgres database.
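+
+The `credentials` file is copied from `credentials.example` by `make`, and follows the same YAML layout, for example:
+
+````
+postgres:
+  user: postgres_username
+  port: 5432
+  host: localhost
+  database: deepdive_app
+  password: password123
+````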
+
+To create a database with the data included in `/setup/usgs_example`:
+
+````
+make local_setup
+````
+
+To run an example, run `python run.py`.
+
+## Running on GeoDeepDive Infrastructure
+All applications are required to have the same structure as this repository, namely an empty folder named `output`, a valid
+`config` file, an updated `requirements.txt` describing any Python dependencies, and a `run.py` which runs the application
+and outputs results. The `credentials` file will be ignored and substituted with a unique version at run time.
+
+The GeoDeepDive infrastructure will have the following software available:
+  + Python 2.7+ (Python 3.x not supported at this time)
+  + PostgreSQL 9.4+, including command line tools and PostGIS
+
+#### Submitting a config file
+The `config` file outlines a list of terms OR dictionaries that you are interested in culling from the corpus. Once you have
+updated this file, a private repository will be set up for you under the UW-DeepDiveInfrastructure Github group, to which you can
+push the code from this repository. Your `config` file will be used to generate a custom testing subset of documents that
+you can use to develop your application.
+
+#### Running the application
+Once you have developed your application and tested it against the corpus subset, simply push your application to the
+private repository created in the previous step. The application will then be run according to the parameters set in the
+`config` file.
+
+#### Getting results
+After the application is run, the contents of the `output` folder will be gzipped and made available for download. If
+your application did not run successfully, any errors thrown will be logged in the file `errors.txt`, which is included
+in the gzipped results package.
+
+## File Summary
+
+#### config
+A YAML file that contains project settings.
+
+
+#### credentials
+A YAML file that contains local Postgres credentials for testing and generating examples.
+
+
+#### requirements.txt
+A list of Python dependencies to be installed by `pip`.
+
+
+#### run.py
+A Python script that runs the entire application, including any setup tasks and the export of results to the folder `/output`.
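+
+
+#### output
+The folder where `run.py` writes its results. After a successful local run, the dumped tables in `output/output.sql` can be
+loaded into another Postgres database for inspection (a usage sketch; `DBNAME` is a placeholder — see `extractions/SQL.txt`):
+
+````
+psql -d DBNAME -f output/output.sql
+````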
+ + +## License +CC-BY 4.0 International diff --git a/config b/config new file mode 100755 index 0000000..fe1a90a --- /dev/null +++ b/config @@ -0,0 +1,17 @@ +# The name of the application (no spaces) +app_name: strom + +# First and last name of the user +user: Jon Husson + +# The NLP product to run the application against +product: NLP352 + +# How often the application should be run +frequency: monthly + +# A list of terms used to subset the corpus +terms: [stromatolite, stromatolitic, thrombolite, thrombolitic] + +# Stored dictionary of terms, to be set by GDD infrastructure admins +dictionary: strom diff --git a/credentials.example b/credentials.example new file mode 100755 index 0000000..82178b7 --- /dev/null +++ b/credentials.example @@ -0,0 +1,6 @@ +postgres: + user: postgres_username + port: 5432 + host: localhost + database: deepdive_app + password: password123 diff --git a/extractions/SQL.txt b/extractions/SQL.txt new file mode 100755 index 0000000..5b17172 --- /dev/null +++ b/extractions/SQL.txt @@ -0,0 +1,31 @@ +#============================================================================== +# PG DUMP FOR RESULTS +#============================================================================== + +pg_dump -t results -t strat_target -t strat_target_distant -t age_check -t bib -t target_adjectives DBNAME > ./output/output.sql + +#============================================================================== +# CREATE (ALREADY PRESENT) DATABASE FROM DUMP +#============================================================================== + +psql -d DBNAME -f ../output/output.sql + +#============================================================================== +# USEFUL SQL QUERIES FOR SUMMARY RESULTS +#============================================================================== + +COPY(SELECT strat_phrase_root,strat_name_id, COUNT(strat_name_id) + FROM results + WHERE (strat_name_id<>'0' AND target_word ILIKE '%stromato%') + GROUP BY strat_phrase_root, strat_name_id) + TO '/Users/jhusson/Box Sync/postdoc/deepdive/stroms/V2/test.csv' DELIMITER ',' CSV HEADER; + +#============================================================================== +# INTERESTING STROMATOLITE ADJECTIVES +#============================================================================== + +SELECT * from target_adjectives WHERE target_adjective ILIKE 'domal' OR +target_adjective ILIKE 'columnar' OR +target_adjective ILIKE 'conical' OR +target_adjective ILIKE 'domical' OR +target_adjective ILIKE 'domed' \ No newline at end of file diff --git a/input/url.txt b/input/url.txt new file mode 100755 index 0000000..e9231e7 --- /dev/null +++ b/input/url.txt @@ -0,0 +1 @@ +deepdivesubmit.chtc.wisc.edu/static/strom_nlp_27Jan2016.zip diff --git a/makefile b/makefile new file mode 100755 index 0000000..ff24788 --- /dev/null +++ b/makefile @@ -0,0 +1,8 @@ +all: + cp credentials.example credentials; + pip install -r requirements.txt; + + + +local_setup: + ./setup/setup.sh diff --git a/output/.gitignore b/output/.gitignore new file mode 100755 index 0000000..d6b7ef3 --- /dev/null +++ b/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..1070da6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +psycopg2>=2.6.1 +pyyaml>=3.11 +tqdm>=1.0 +stop-words>=2015.2.23.1 +docopt>=0.6.1 +numpy>=1.9.2 \ No newline at end of file diff --git a/run.py b/run.py new file mode 100755 index 0000000..ed3c80d --- /dev/null +++ b/run.py @@ -0,0 +1,78 @@ 
+#==============================================================================
+#RUN ALL - STROMATOLITES
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites
+
+#==============================================================================
+
+import os, time, subprocess, yaml
+
+#tic
+start_time = time.time()
+
+#load configuration file
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+#load credentials file
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+
+#ensure working directory is proper
+#os.chdir("/Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites")
+
+#INITIALIZE THE POSTGRES TABLES
+print 'Step 1: Initialize the PSQL tables ...'
+subprocess.call('./setup/setup.sh', shell=True)
+os.system('python ./udf/initdb.py')
+
+#BUILD THE BIBLIOGRAPHY
+print 'Step 2: Build the bibliography ...'
+os.system('python ./udf/buildbib.py')
+
+#FIND TARGET INSTANCES
+print 'Step 3: Find stromatolite instances ...'
+os.system('python ./udf/ext_target.py')
+
+#FIND STRATIGRAPHIC ENTITIES
+print 'Step 4: Find stratigraphic entities ...'
+os.system('python ./udf/ext_strat_phrases.py')
+
+#FIND STRATIGRAPHIC MENTIONS
+print 'Step 5: Find stratigraphic mentions ...'
+os.system('python ./udf/ext_strat_mentions.py')
+
+#CHECK AGE - UNIT MATCH AGREEMENT
+print 'Step 6: Check age - unit match agreement ...'
+os.system('python ./udf/ext_age_check.py')
+
+#DEFINE RELATIONSHIPS BETWEEN TARGET AND STRATIGRAPHIC NAMES
+print 'Step 7: Define the relationships between stromatolite phrases and stratigraphic entities/mentions ...'
+os.system('python ./udf/ext_strat_target.py')
+
+#DEFINE RELATIONSHIPS BETWEEN TARGET AND DISTANT STRATIGRAPHIC NAMES
+print 'Step 8: Define the relationships between stromatolite phrases and distant stratigraphic entities/mentions ...'
+os.system('python ./udf/ext_strat_target_distant.py')
+
+#DELINEATE REFERENCE SECTIONS FROM MAIN BODY EXTRACTIONS
+print 'Step 9: Delineate reference section from main body extractions ...'
+os.system('python ./udf/ext_references.py')
+
+#BUILD A BEST RESULTS TABLE OF STROM-STRAT_NAME TUPLES
+print 'Step 10: Build a best results table of strom-strat_name tuples ...'
+os.system('python ./udf/ext_results.py')
+
+#FIND ADJECTIVES DESCRIBING STROM
+print 'Step 11: Find adjectives describing strom target words ...'
+os.system('python ./udf/ext_target_adjective.py')
+
+#POSTGRES DUMP
+print 'Step 12: Dump select results from PSQL ...'
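+#the dump command below is assembled from the local credentials; with the
+#values from credentials.example it would expand to, for example:
+#   pg_dump -U postgres_username -t results -t strat_target -t strat_target_distant -t age_check -t refs_location -t bib -t target_adjectives -d deepdive_app > ./output/output.sql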
+output = 'pg_dump -U '+ credentials['postgres']['user'] + ' -t results -t strat_target -t strat_target_distant -t age_check -t refs_location -t bib -t target_adjectives -d ' + credentials['postgres']['database'] + ' > ./output/output.sql' +subprocess.call(output, shell=True) + +#summary of performance time +elapsed_time = time.time() - start_time +print '\n ###########\n\n elapsed time: %d seconds\n\n ###########\n\n' %(elapsed_time) diff --git a/setup/setup.sh b/setup/setup.sh new file mode 100755 index 0000000..f388a94 --- /dev/null +++ b/setup/setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# via http://stackoverflow.com/a/21189044/1956065 +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i length of VARIABLE == number of fields + #--> length of VARIABLE[i] == number of rows + #--> VARIABLE[i][0] = header name + cols = list(zip(*dump)) + + #key names correspond to field names (headers in the CSV file) + for field in cols: + dump_dict[field[0]]=field[1:] + + dump_dict['headers'] = sorted(dump_dict.keys()) + + return dump_dict + + +#Connect to Postgres +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + + +#initalize the age_check table +cursor.execute(""" + DELETE FROM age_check; +""") +connection.commit() + +#initialize the age_agree column in strat_phrases +cursor.execute(""" + UPDATE strat_phrases + SET age_agree = '-'; +""") +connection.commit() + + +#strat_phrases data dump +cursor.execute(""" + SELECT DISTINCT ON(strat_name_id, int_name) + + strat_phrase_root, + strat_flag, + strat_name_id, + int_name, + int_id + + FROM strat_phrases + + WHERE strat_name_id<>'0' + AND int_name<>'na' + +""") + + +#convert list of tuples to list of lists +int_list=cursor.fetchall() +int_list = [list(elem) for elem in int_list] + +#gather list of unique strat_name_ids +strat_name_ids = set([i[2] for i in int_list]) + +#define overlap buffer between unit_ages and interval_ages +age_buff=25 + +#initialize summary variables +huh=0 +yay=0 +nay=0 +int_check=[] + +#loop through all unique strat_name ids to check extracted age - unit link congruency +for idx, name in enumerate(strat_name_ids): + + #find all strat_name_list - interval tuples to be checked + name_check = [j for j in int_list if j[2]==name] + + #split into individual strat_name_ids by user-defined deilimiter + strat_name_id = name.split('~') + + #initiliaze variables for checking + unit_ages=[] + skip=0 + + #loop through each individual strat_name_id + for match in strat_name_id: + #hit the api to find unit_matches through /units route + unit_link = download_csv( 'https://macrostrat.org/api/units?format=csv&strat_name_id=' + match ) + + #if matches found, define b_age and t_age for a given strat_name from the constituent units + if unit_link['headers']: + 
+            unit_ages.append([max(float(x) for x in unit_link['b_age']), min(float(x) for x in unit_link['t_age'])])
+        else:
+            unit_ages.append('na')
+            skip+=1
+
+    #loop through each individual strat_name_list - interval tuple
+    for idx2,row in enumerate(name_check):
+
+        #initialize variables for checking
+        age_check=[]
+        int_id=row[4]
+        age_def=row[3]
+        int_age=[]
+
+        #case if interval information is a known interval name (AND at least one strat_name_id has a match)
+        if int_id!=0 and len(strat_name_id)!=skip:
+            int_def = download_csv( 'https://macrostrat.org/api/defs/intervals?format=csv&int_id=' + str(int_id) )
+            int_age = [float(int_def['b_age'][0]), float(int_def['t_age'][0])]
+
+        #case if interval information is a numeric age (AND at least one strat_name_id has a match)
+        elif len(strat_name_id)!=skip:
+            age_def=age_def.split(' ')
+
+            #if units are Gyr
+            if age_def[1].lower() in 'ga':
+                try:
+                    age=float(age_def[0])*1000
+                    int_age = [abs(age), abs(age)]
+
+                except ValueError:
+                    age='na'
+
+            #if units are Myr
+            else:
+                try:
+                    age=float(age_def[0])
+                    int_age = [abs(age), abs(age)]
+
+                except ValueError:
+                    age='na'
+
+        #compare each individual strat_name age range to the interval information
+        for unit in unit_ages:
+
+            #case if unit or interval information not recovered
+            if unit=='na' or not int_age:
+                age_check.append('NA')
+                huh+=1
+
+            #case if unit and interval ages do not cross (within the buffer)
+            elif unit[0]+age_buff<int_age[1] or unit[1]>int_age[0]+age_buff:
+                age_check.append('no')
+                nay+=1
+
+            #case if they do
+            else:
+                age_check.append('yes')
+                yay+=1
+
+        #summarize the findings for all strat_name_ids
+        name_check[idx2].extend(['~'.join(age_check)])
+
+    #dump to a local variable
+    int_check.extend(name_check)
+
+#write to PSQL table
+for idx,i in enumerate(int_check):
+    strat_phrase_root, strat_flag, strat_name_id, int_name, int_id, age_agree = i
+
+    cursor.execute("""
+        INSERT INTO age_check(  strat_phrase_root,
+                                strat_flag,
+                                strat_name_id,
+                                int_name,
+                                int_id,
+                                age_agree)
+        VALUES (%s, %s, %s, %s, %s, %s);""",
+        (strat_phrase_root, strat_flag, strat_name_id, int_name, int_id, age_agree)
+        )
+
+
+#push insertions
+connection.commit()
+
+#VACUUM ANALYZE cannot run inside a transaction block, so switch to autocommit
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE age_check;
+""")
+connection.commit()
+
+
+#splice strat_name_id-age tuples into the strat_phrases table
+cursor.execute(""" UPDATE strat_phrases
+
+        SET age_agree = age_check.age_agree
+        FROM age_check
+        WHERE strat_phrases.strat_name_id = age_check.strat_name_id
+        AND strat_phrases.int_name = age_check.int_name
+
+""")
+connection.commit()
+
+#VACUUM ANALYZE cannot run inside a transaction block, so switch to autocommit
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE strat_phrases;
+""")
+connection.commit()
+
+#close the connection
+connection.close()
+
+
+#summary statistic
+success = 'SUMMARY OF AGE CHECKS: yays = %s; nays = %s; unknown = %s' %(yay, nay, huh)
+
+#summary of performance time
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
diff --git a/udf/ext_references.py b/udf/ext_references.py
new file mode 100755
index 0000000..62e739a
--- /dev/null
+++ b/udf/ext_references.py
@@ -0,0 +1,219 @@
+#==============================================================================
+#DEFINE BEGINNING OF REFERENCES SECTION
+#==============================================================================
+
+#==============================================================================
+# ACQUIRE RELEVANT MODULES and DATA
+#============================================================================== + +import time, psycopg2, yaml +import numpy as np + +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +#Credentials and configuration +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) + +#make some cursors for writing/reading from Postgres +cursor = connection.cursor() +doc_cursor=connection.cursor() +sent_cursor = connection.cursor() + + +#============================================================================== +# FIND REFERENCE SECTIONS +#============================================================================== + +#list of unique docids from target-strat tuples +doc_cursor.execute(""" + SELECT docid FROM strat_target + UNION + SELECT docid FROM strat_target_distant +""") + +#initialize Numpy arrays +refs=np.zeros(0,dtype={'names':['docid','sentid','type','depth'],'formats':['|S100','i4','|S100','f4']}) +best_refs=np.zeros(0,dtype={'names':['docid','sentid','type','depth'],'formats':['|S100','i4','|S100','f4']}) + +#loop through documents list +for idx, doc in enumerate(doc_cursor): + #array for reference section for this document + tmp_refs=np.zeros(0,dtype={'names':['docid','sentid','type','depth'],'formats':['|S100','i4','|S100','f4']}) + + #collect all sentences for this document + sent_cursor.execute(""" + SELECT docid, sentid, words from %(my_app)s_sentences_%(my_product)s + WHERE docid=%(my_docid)s;""", + { + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()), + "my_docid": doc[0], + }) + + #loop through sentences + for idx2, sent in enumerate(sent_cursor): + docid,sentid,words = sent + phrase = ' '.join(words) + + #REF ID LOGIC: is the first word in a sentence 'References'? + if words[0]=='References' or words[0]=='REFERENCES': + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is the first word in a sentence 'Bibliography'? + if words[0]=='Bibliography' or words[0]=='BIBLIOGRAPHY': + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is the first word in a sentence French for 'Bibliography'? + if words[0]=='Bibliographie' or words[0]=='BIBLIOGRAPHIE': + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is there an all capitalized 'REFERENCES' in words array? + if 'REFERENCES' in words: + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ref_mention',0)],dtype=tmp_refs.dtype)) + + #REF ID LOGIC: is the word 'Acknowledgements' in words array? + if 'Acknowledgements' in words or 'Acknowledgments' in words or 'ACKNOWLEDGEMENTS' in words or 'ACKNOWLEDGMENTS' in words: + tmp_refs = np.append(tmp_refs,np.array([(docid,sentid,'ack',0)],dtype=tmp_refs.dtype)) + + #null case where no reference section is identified + if len(tmp_refs)==0: + tmp_refs = np.array([(docid,0,'none',0)],dtype=tmp_refs.dtype) + + #parameter characterizing how deep the reference section is (ref sent #)/(total sent #) + tmp_refs['depth']=tmp_refs['sentid']/(idx2+1.) 
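+
+    #worked example (hypothetical numbers): a 'References' header at sentid 450
+    #of a 500-sentence document gets depth = 450/500 = 0.9 and survives the 0.1
+    #cutoff applied below; a stray 'REFERENCES' at sentid 30 gets 0.06 and is
+    #reset to the null case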
+ + #all potential reference breaks + refs = np.append(refs,tmp_refs) + + #'Best' reference break is the deepest sentid + tmp_refs=np.sort(tmp_refs,order='sentid') + best_refs = np.append(best_refs,tmp_refs[-1]) + + +#arbitrary cutoff for 'good' inferences - reset those below threshold to null case +best_refs['sentid'][best_refs['depth']<0.1]=0 +best_refs['type'][best_refs['depth']<0.1]='none' +best_refs['depth'][best_refs['depth']<0.1]=0.0 + +zeros = best_refs[best_refs['sentid']==0] + + +#============================================================================== +# PUSH REFERENCE FINDINGS TO POSTGRES +#============================================================================== + +#Make a new table +cursor.execute(""" + DROP TABLE IF EXISTS refs_location CASCADE; + CREATE TABLE refs_location( + docid text, + sentid int, + type text, + depth real); +""") +connection.commit() + +#loop through best reference ids and push to Postgres +for row in best_refs: + cursor.execute(""" + INSERT INTO refs_location( docid, + sentid, + type, + depth) + VALUES (%s, %s, %s, %s);""", + (row['docid'],str(row['sentid']),row['type'],str(row['depth'])) + ) + + +#Join reference locations to target-strat tuples +cursor.execute(""" UPDATE strat_target + SET refs_loc = refs_location.sentid + FROM refs_location + WHERE strat_target.docid = refs_location.docid + +""") + +#Join reference locations to target-strat_distant tuples +cursor.execute(""" UPDATE strat_target_distant + SET refs_loc = refs_location.sentid + FROM refs_location + WHERE strat_target_distant.docid = refs_location.docid +""") + +#Add 'in references'/'out of references' inference to target-strat tuples +cursor.execute(""" UPDATE strat_target + SET in_ref = 'yes' + WHERE sentid > refs_loc + AND refs_loc <>0 + +""") + +#Add 'in references'/'out of references' inference to target-strat_distant tuples +cursor.execute(""" UPDATE strat_target_distant + SET in_ref = 'yes' + WHERE sentid > refs_loc + AND refs_loc <>0 + +""") + +#push changes +connection.commit() + +#close the postgres connection +connection.close() + +elapsed_time = time.time() - start_time + + +#%% FOR DEBUGGING + +#tmp_refs=best_refs[(best_refs['sentid']!=0)] +# +#tmp = tmp_refs[np.random.choice(len(tmp_refs), 1)] +# +#my_sentid= np.arange(tmp['sentid']-4,tmp['sentid']+20) +# +#sent_cursor.execute(""" +# SELECT docid, sentid, words from %(my_app)s_sentences_%(my_product)s +# WHERE docid=%(my_docid)s +# AND sentid = ANY(%(my_sentid)s) +# ORDER BY sentid;""", +# { +# "my_app": AsIs(config['app_name']), +# "my_product": AsIs(config['product'].lower()), +# "my_docid": tmp['docid'][0], +# "my_sentid": (list(my_sentid),) +# }) +# +#phrase='' +#for idx2, sent in enumerate(sent_cursor): +# docid,sentid,words = sent +# words = ' '.join(words) +# +# if sentid==tmp['sentid']: +# flag=words +# phrase = phrase+'\n***** '+words +# else: +# phrase = phrase+'\n-'+words +## print words +# +## if sentid==tmp['sentid']: +# +# +#print '\n ###########\n\n %s \n\n ###########\n\n %s \n\n ###########\n\n' %(phrase,flag) + diff --git a/udf/ext_results.py b/udf/ext_results.py new file mode 100755 index 0000000..87e7ef4 --- /dev/null +++ b/udf/ext_results.py @@ -0,0 +1,170 @@ +#============================================================================== +#GENERATE RESULTS TABLE +#============================================================================== + +import time, random, re, yaml, psycopg2, copy +from psycopg2.extensions import AsIs + +start_time = time.time() + +# Connect to Postgres +with 
open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + + +#NEW RESULTS TABLE +cursor.execute(""" + DROP TABLE IF EXISTS results CASCADE; + CREATE TABLE results( + target_id int, + docid text, + sentid int, + target_word text, + strat_phrase_root text, + strat_flag text, + strat_name_id text, + age_sum text, + source text, + phrase text, + is_strat_name text DEFAULT 'yes', + in_ref text + ); +""") +connection.commit() + +#TMP RESULTS TABLE +cursor.execute(""" + DROP TABLE IF EXISTS results_new; +""") + +#push drop/create to the database +connection.commit() + +#gather results from the same-sentence inferences +cursor.execute(""" + INSERT INTO results (target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, phrase, in_ref) + (SELECT target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, sentence, in_ref + FROM strat_target + WHERE ((num_phrase=1 AND @(target_distance)<51) + OR (target_relation='parent' AND num_phrase <8 AND @(target_distance)<51) + OR (target_relation='child' AND num_phrase <8 AND @(target_distance)<51)))""" +) + +#push insertions +connection.commit() + +#mark these inferences as coming from same sentence +cursor.execute(""" + UPDATE results SET source='in_sent' WHERE source IS NULL + """ +) + +#push update +connection.commit() + +#gather results from the near-sentence inferences +cursor.execute(""" + INSERT INTO results (target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, phrase, in_ref) + (SELECT target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, words_between, in_ref + FROM strat_target_distant + WHERE num_phrase=1)""" +) + +#push insertions +connection.commit() + +#mark these inferences as coming from near sentence +cursor.execute(""" + UPDATE results SET source='out_sent' WHERE source IS NULL + """ +) + +#remove non-unique rows +cursor.execute(""" + CREATE TABLE results_new AS (SELECT DISTINCT * FROM results) + """ +) + + +#adopt tmp results table +cursor.execute(""" + DROP TABLE results + """ +) + +cursor.execute(""" + ALTER TABLE results_new RENAME TO results; + """ +) + + +#add serial primary key +cursor.execute(""" + ALTER TABLE results ADD COLUMN result_id serial PRIMARY KEY; + """ +) + +#push updates +connection.commit() + +#list of known and troublesome ligatures +weird_strings = [['\xef\xac\x82', 'fl'], ['\xef\xac\x81', 'fi']] + + +#IMPORT THE RESULTS - SIMPLE CHECK FOR STRAT NAME MENTION VALIDITY +cursor_main = connection.cursor() +cursor_main.execute(""" SELECT * FROM results WHERE strat_flag = 'mention'; """) + +test=[] + +for line in cursor_main: + #collect individual elements from the results dump + target_id, docid, sentid, target_word, strat_phrase_root,strat_flag,strat_name_id, age_sum, source, phrase, mention_check, in_ref, result_id = line + checked=[] + + #ligature replacement + for ws in weird_strings: + if ws[0] in phrase: + phrase=phrase.replace(ws[0],ws[1]) + + #find all mentions of strat_phrase_root + matches=[m.start() for m in re.finditer(strat_phrase_root,phrase)] + + #loop through 
matches + for m in matches: + #lets look at the word that follows the potential strat name + tocheck = phrase[m+len(strat_phrase_root)+1:] + tocheck=tocheck.split(' ') + + #capitalized word following strat name mention invalidates it. Exceptions include: + #1) end of sentence 2) Series 3) parantheses + if tocheck[0].lower()!=tocheck[0] and tocheck[0]!='Series' and tocheck[0][0]!='.' and tocheck[0]!='-LRB-' and tocheck[0]!='-RRB-': + checked.append('no') + else: + checked.append('yes') + + #update post gres table + if 'yes' not in checked: + cursor.execute(""" + UPDATE results SET is_strat_name = %s WHERE result_id = %s;""", + ('no',result_id) + ) + +#push update +connection.commit() + +#close the postgres connection +connection.close() + diff --git a/udf/ext_strat_mentions.py b/udf/ext_strat_mentions.py new file mode 100755 index 0000000..1eae13e --- /dev/null +++ b/udf/ext_strat_mentions.py @@ -0,0 +1,264 @@ +##============================================================================== +## LOOK FOR STRATIGRAPHIC NOMENCLATURE - MENTION RECOGINITION +##============================================================================== + +#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf + +#============================================================================== +# ACQUIRE RELEVANT MODULES +#============================================================================== +import time, urllib2, csv, random, psycopg2, re, yaml +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +#function for dowloading CSVs from a URL +def download_csv( url ): + + #return variable + dump_dict = {} + + #get strat_names from Macrostrat API + dump = urllib2.urlopen( url ) + dump = csv.reader(dump) + + #unpack downloaded CSV as list of tuples + #--> length of VARIABLE == number of fields + #--> length of VARIABLE[i] == number of rows + #--> VARIABLE[i][0] = header name + cols = list(zip(*dump)) + + #key names correspond to field names (headers in the CSV file) + for field in cols: + dump_dict[field[0]]=field[1:] + + dump_dict['headers'] = sorted(dump_dict.keys()) + + return dump_dict + +#============================================================================== +# CONNECT TO POSTGRES +#============================================================================== + +# Connect to Postgres +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + +#initialize mentions +cursor.execute("""DELETE FROM strat_phrases WHERE strat_flag='mention'; +""") + +#import sentences to mine - just restricted to sentences with target instance +cursor.execute(""" + SELECT DISTINCT ON (target_instances.docid, + target_instances.sentid) + + target_instances.docid, + target_instances.sentid, + %(my_app)s_sentences_%(my_product)s.words + FROM %(my_app)s_sentences_%(my_product)s, target_instances + WHERE %(my_app)s_sentences_%(my_product)s.docid = target_instances.docid + AND %(my_app)s_sentences_%(my_product)s.sentid = target_instances.sentid; +""",{ + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()) +}) +sentences=cursor.fetchall() + +#convert list of 
tuples to list of lists +sentences = [list(elem) for elem in sentences] + +#import docid - strat_name tuples +cursor.execute(""" + SELECT * FROM strat_dict; +""") +connection.commit() + +strat_dict = cursor.fetchall() + +#convert list of tuples to list of lists +strat_dict = [list(elem) for elem in strat_dict] + +#make a dictionary of docid-strat_name tuples +doc_list={} +for i in strat_dict: + doc_list[i[0]]=set(i[1]) + +#============================================================================== +# DEFINE STRATIGRPAHIC VARIABLES +#============================================================================== + +#get interval_names from Macrostrat API +int_dict = download_csv( 'https://macrostrat.org/api/defs/intervals?all&format=csv' ) + +#user-defined variables +with open('./var/strat_variables.txt') as fid: + strat_variables = fid.readlines() + +for i in strat_variables: + exec i + +#PRE-PROCESS: hack to replace weird strings +for idx,line in enumerate(sentences): + for ws in weird_strings: + if ws[0] in ' '.join(sentences[idx][2]): + sentences[idx][2]=[word.replace(ws[0],ws[1]) for word in sentences[idx][2]] + + +#with a dictionary of stratigraphic entites mapped to a given document, find the mentions +# i.e. find 'the Bitter Springs stromatolite' after identifying 'the Bitter Springs Formation' +strat_flag = 'mention' +age_agree='-' + +strat_list=[] + +#loop through documents with discoverd stratigraphic entities +for idx1,doc in enumerate(doc_list.keys()): + #list of sentences data from a given document + target_sents = [k for k in sentences if k[0]==doc] + #list of stratigraphic names associated with that document + target_strat = list(doc_list[doc]) + + + #loop through sentence data per document + for idx2,line in enumerate(target_sents): + doc_id, sent_id, words = line + + sentence = ' '.join(words) + + for name in target_strat: + #parse the (strat_name, strat_name_id) tuple + strat_phrase=name.split(DICT_DELIM)[0] + strat_phrase=strat_phrase.split(' ') + strat_phrase=' '.join(strat_phrase[0:-1]) + + strat_name_id=name.split(DICT_DELIM)[1] + + matches=[m.start() for m in re.finditer(r'\b' + strat_phrase + r'\b',sentence)] + + if matches: + #if at least one match is found, count number of spaces backward to arrive at word index + name_idx = [sentence[0:m].count(' ') for m in matches] + #remove double hits (i.e. stromatolitic-thrombolitic) + name_idx = list(set(name_idx)) + #split the strat mention into parts + name_part = strat_phrase.split(' ') + + #loop through all discoveries + for i in name_idx: + #record it as a mention if: + # 1) it is not at the end of the sentence + # 2) the phrase is not followed by a strat_flag + # (this is to avoid duplication) + # 3) the mention is not part of garbled table e.g. 
'Tumbiana Tumbiana Tumbiana Tumbiana'
+                    if i+len(name_part)<len(words) and words[i+len(name_part)] not in strat_flags and sentence.count(strat_phrase)<4:
+
+                        #initial assumption is that no age information accompanies the mention
+                        int_name='na'
+                        int_id='0'
+
+                        #look to see if there is an interval name before the mention
+                        if i>1 and words[i-1] in int_dict['name']:
+                            #record this interval name
+                            int_name=words[i-1]
+                            #list comprehensions to record interval id
+                            locations = [k for k, t in enumerate(int_dict['name']) if t==int_name]
+                            int_id = [int_dict['int_id'][I] for I in locations]
+                            int_id=int_id[0]
+
+                        #look to see if there is an age_flag before the mention
+                        elif i>1 and words[i-1] in age_flags:
+                            #record age flag with its preceding word (most likely a number)
+                            int_name = words[i-2] + ' ' + words[i-1]
+
+                        #record where mention is found
+                        max_word_id = str(i+len(name_part))
+                        min_word_id = str(i)
+
+                        #add to local variable
+                        strat_list.append('\t'.join(str(x) for x in [idx2, doc_id, sent_id,name.split(DICT_DELIM)[0], strat_phrase,strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence]))
+
+                        #write to PSQL table
+                        cursor.execute("""
+                            INSERT INTO strat_phrases(  docid,
+                                                        sentid,
+                                                        strat_phrase,
+                                                        strat_phrase_root,
+                                                        strat_flag,
+                                                        phrase_start,
+                                                        phrase_end,
+                                                        strat_name_id,
+                                                        int_name,
+                                                        int_id,
+                                                        sentence,
+                                                        age_agree)
+                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""",
+                            (doc_id, sent_id,name.split(DICT_DELIM)[0], strat_phrase, strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence, age_agree)
+                            )
+
+#push insertions to the database
+connection.commit()
+
+#VACUUM ANALYZE cannot run inside a transaction block, so switch to autocommit
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE strat_phrases;
+""")
+connection.commit()
+
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE target_instances;
+""")
+connection.commit()
+
+
+#summarize the number of DISTINCT strat_name_roots found in a given sentence
+cursor.execute(""" WITH query AS(SELECT docid, sentid,
+                        COUNT(DISTINCT strat_phrase_root) AS count
+                        FROM strat_phrases
+                        GROUP BY docid,sentid)
+
+                   UPDATE strat_phrases
+                        SET num_phrase = query.count
+                        FROM query
+                        WHERE strat_phrases.docid = query.docid
+                        AND strat_phrases.sentid = query.sentid
+
+""")
+connection.commit()
+
+#summarize the number of DISTINCT strat_name_roots found for a given document
+cursor.execute(""" WITH query AS(SELECT docid,
+                        COUNT(DISTINCT strat_phrase_root) AS count
+                        FROM strat_phrases
+                        GROUP BY docid)
+
+                   UPDATE target_instances
+                        SET num_strat_doc = query.count
+                        FROM query
+                        WHERE target_instances.docid = query.docid
+""")
+connection.commit()
+
+#close the postgres connection
+connection.close()
+
+#summary statistic
+success = 'number of stratigraphic mentions : %s' %len(strat_list)
+
+#summary of performance time
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
+
+#print out random result
+r=random.randint(0,len(strat_list)-1); show = "\n".join(str(x) for x in strat_list[r].split('\t')); print "=========================\n" + show + "\n========================="
diff --git a/udf/ext_strat_phrases.py b/udf/ext_strat_phrases.py
new file mode 100755
index 0000000..1257151
--- /dev/null
+++ b/udf/ext_strat_phrases.py
@@ -0,0 +1,336 @@
+#==============================================================================
+#STRATIGRAPHIC NAME EXTRACTOR
+#   ENTITIES = CAPITALIZED WORDS PRECEDING A STRATIGRAPHIC FLAG
+#   MENTIONS = DEFINED ENTITIES MINUS THE STRATIGRAPHIC FLAG
+#
+#   ENTITY MAPPING DONE ON THE FULL SENTENCES TABLE
+#   MENTIONS DEFINED BY ENTITIES FOUND IN A GIVEN DOCUMENT
+#   MENTION MAPPING DONE ON SENTENCES WITH A TARGET INSTANCE
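+#
+#   EXAMPLE: IF 'BITTER SPRINGS FORMATION' IS FOUND AS AN ENTITY IN A DOCUMENT,
+#   A LATER BARE 'BITTER SPRINGS' IN THAT DOCUMENT IS RECORDED AS A MENTION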
+#============================================================================== + +#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf + +#============================================================================== +# ACQUIRE RELEVANT MODULES +#============================================================================== +import time, urllib2, csv, random, psycopg2, re, string, yaml +from stop_words import get_stop_words +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +#============================================================================== +# DEFINE FUNCTION TO DOWNLOAD CSV +#============================================================================== +def download_csv( url ): + + #return variable + dump_dict = {} + + #get strat_names from Macrostrat API + dump = urllib2.urlopen( url ) + dump = csv.reader(dump) + + #unpack downloaded CSV as list of tuples + #--> length of VARIABLE == number of fields + #--> length of VARIABLE[i] == number of rows + #--> VARIABLE[i][0] = header name + cols = list(zip(*dump)) + + #key names correspond to field names (headers in the CSV file) + for field in cols: + dump_dict[field[0]]=field[1:] + + dump_dict['headers'] = sorted(dump_dict.keys()) + + return dump_dict + +#============================================================================== +# CONNECT TO POSTGRES +#============================================================================== +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + +#initalize the strat_phrases table +cursor.execute(""" + DELETE FROM strat_phrases; +""") + +#IMPORT THE SENTENCES DUMP +cursor.execute(""" + SELECT docid, sentid, words FROM %(my_app)s_sentences_%(my_product)s; +""", { + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()) +}) +#sentences=cursor.fetchall() + +#convert list of tuples to list of lists +#sentences = [list(elem) for elem in sentences] + +#push drop/create to the database +connection.commit() + +#============================================================================== +# DEFINE STRATIGRPAHIC VARIABLES +#============================================================================== + +#get strat_names from Macrostrat API +strat_dict = download_csv( 'https://macrostrat.org/api/defs/strat_names?all&format=csv' ) + +#get interval_names from Macrostrat API +int_dict = download_csv( 'https://macrostrat.org/api/defs/intervals?all&format=csv' ) + +#stop words +stop_words = get_stop_words('english') +stop_words = [i.encode('ascii','ignore') for i in stop_words] +alpha = list(string.ascii_lowercase); +alpha_period = [i+'.' 
for i in alpha] +stop_words = stop_words + ['lower','upper','research'] + alpha + alpha_period + +#STRATIGRAPHIC VARIABLE DEFINITIONS +with open('./var/strat_variables.txt') as fid: + strat_variables = fid.readlines() + +for i in strat_variables: + exec i + +#============================================================================== +# LOOK FOR STRATIGRAPHIC NOMENCLATURE - ENTITY RECOGNITION +#============================================================================== + +#PRE-PROCESS: hack to replace weird strings +changed_docs=[]; + +#initialize the list of found names and list of documents +strat_list=[] +doc_list={} +to_write = [] + +#loop through sentences +for idx,line in enumerate(cursor): + line = list(line) + for ws in weird_strings: + if ws[0] in ' '.join(line[2]): + changed_docs.append([line[0], line[1], ws[0], ws[1]]) + line[2]=[word.replace(ws[0],ws[1]) for word in line[2]] + line = tuple(line) + + #collect individual elements from the psql sentences dump + doc_id, sent_id, words = line + + #initialize the variables needed to analyze words in sentence + i = 0 + complete_phrase = [] + + for word in words: + i += 1 + + #initial assumption is a found strat name will have no age information and no link to Macrostrat + int_name="na" + int_id='0' + strat_name_id = '0' + + #initialize the lists of word indices and stratigraphic phrase words + indices=[] + strat_phrase = [] + + #logic triggered by discovery of 'stratigraphic' flag (i.e. Formation, etc.) + if word in strat_flags: + #record the found word and its index + indices.append(i) + this_word = words[i-1] + + #initialize variables needed for analysis of preceding words + preceding_words=[] + j = 2 + + #loop to identify preceding stratigraphic modifiers on GOOD_WORD (e.g. Wonoka Formation) + #loop continues if: + # 1) the beginning of sentence is not reached + # 2) the preceding string is not empty + # 3) the preceding word is not the current word + # 4) the preceding word is capitalized + # 5) the preceding capitalized word is not a stratigraphic flag (e.g. Member Wonoka Formation) + # 6) the preceding word is not a capitalized stop word + # 7) the preceding word does not contain a number + while (i-j)>(-1) and len(words[i-j])!=0 and words[i-j] != words[i-j+1] and words[i-j][0].isupper() and words[i-j] not in strat_flags and words[i-j].lower() not in stop_words and re.findall(r'\d+', words[i-j])==[]: + #loop also broken if preceding word is an interval name (e.g. Ediacaran Wonoka Formation) + if words[i-j] in int_dict['name']: + #record this interval name + int_name=words[i-j] + + #list comprehensions to record interval id + locations = [k for k, t in enumerate(int_dict['name']) if t==int_name] + int_id = [int_dict['int_id'][I] for I in locations] + int_id=int_id[0] + break + + #loop also broken if preceding word is an age flag (i.e. 580 Ma. Wonoka Formation) + elif words[i-j] in age_flags: + #record age flag with its preceding word (most likely a number) + int_name = words[i-j-1] + ' ' + words[i-j] + break + + #record qualifying preceding words and their indices + preceding_words.append(words[i-j]) + indices.append((i-j)) + j += 1 + + #if qualifying preceding words found, join them to the stratigraphic flag and create a stratigraphic phrase + if preceding_words and len(preceding_words)<4: + #create a full and partial stratigraphic phrase (i.e. 
with and without the stratigraphic flag) + preceding_words.reverse() + strat_phrase = ' '.join(preceding_words) + ' ' + this_word + strat_phrase_cut = ' '.join(preceding_words) + strat_flag=this_word + + #define term to check against Macrostrat's definitions + # i.e. Bitter Springs for Bitter Springs Formation + # Manlius Limestone for Manlius Limestone + if strat_flag in lith_flags: + strat_phrase_check = strat_phrase + else: + strat_phrase_check = strat_phrase_cut + + #index stratigraphic name to Macrostrat (if present) + if strat_phrase_check in strat_dict['strat_name']: + #list comprehensions to record strat name id (all string matches regardless of inferred rank) + locations = [k for k, t in enumerate(strat_dict['strat_name']) if t==strat_phrase_check] + loc_ids = [strat_dict['strat_name_id'][L] for L in locations] + if loc_ids: + strat_name_id = '~'.join(str(e) for e in loc_ids) + + #beginning and end of stratigraphic phrase + max_word_id = max(indices) + min_word_id = min(indices) + + #create list of stratigraphic phrases found in a given sentence + complete_phrase.append((idx, strat_phrase, strat_phrase_cut,strat_flag, doc_id, sent_id, max_word_id, min_word_id, strat_name_id,int_name,int_id, ' '.join(words))) + + #once sentence has been mined, add finds to growing list of stratigraphic names + for idx,strat_phrase,strat_phrase_cut,strat_flag, doc_id, sent_id, max_word_id, min_word_id, strat_name_id,int_name,int_id, sentence in complete_phrase: + + #dump to local variable + strat_list.append('\t'.join([str(x) for x in [idx, doc_id, sent_id, strat_phrase,strat_phrase_cut, strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence]])) + + #make dictionary of (strat name, strat_name_id), separated by user defined delimiet, per doc id + if doc_id in doc_list.keys(): + doc_list[doc_id].add(strat_phrase+DICT_DELIM+strat_name_id) + else: + doc_list[doc_id]=set([strat_phrase+DICT_DELIM+strat_name_id]) + + to_write.append((doc_id, sent_id, strat_phrase,strat_phrase_cut, strat_flag, min_word_id, max_word_id, strat_name_id,int_name,int_id, sentence)) + +#write to PSQL table +cursor.executemany(""" + INSERT INTO strat_phrases( docid, + sentid, + strat_phrase, + strat_phrase_root, + strat_flag, + phrase_start, + phrase_end, + strat_name_id, + int_name, + int_id, + sentence) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""", to_write) + + +#push insertions +connection.commit() + +#some sort of magic +connection.set_isolation_level(0) +cursor.execute(""" VACUUM ANALYZE strat_phrases; +""") +connection.commit() + +#initalize the strat_dict table +cursor.execute(""" + DELETE FROM strat_dict; +""") + +#write stratigraphic names found in documents to a PSQL table +for idx1,doc in enumerate(doc_list.keys()): + strat_doc = list(doc_list[doc]) + cursor.execute(""" + INSERT INTO strat_dict( docid, + strat_phrase) + VALUES (%s, %s);""", + (doc, strat_doc) + ) + +connection.commit() + +#some sort of magic +connection.set_isolation_level(0) +cursor.execute(""" VACUUM ANALYZE strat_dict; +""") +connection.commit() + +#close the postgres connection +connection.close() + +#summary statistic +success = 'number of stratigraphic entities : %s' %len(strat_list) + +#summary of performance time +elapsed_time = time.time() - start_time +print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time) + +#print out random result +r=random.randint(0,len(strat_list)-1); show = "\n".join(str(x) for x in strat_list[r].split('\t')); print 
"=========================\n" + show + "\n=========================" + + +#%% OLD CODE +##IMPORT SENTENCES TO MINE +#fid = open('/Users/jhusson/local/bin/deepdive-0.7.1/app/stromatolites/tutorial/input/strat_locations.tsv','r') +#test = fid.readlines() +#fid.close() + +##SPLIT LINE INTO TAB SEPARATED COMPONENTS +#elem = line.split('\t') + +##WRITE DATA TO A FILE +#fid = open('/Users/jhusson/local/bin/deepdive-0.7.1/app/stromatolites/tutorial/input/strat_phrases.tsv','w') +#for item in strat_list: +# fid.write("%s\n" % item) +#fid.close() + +##USEFUL BIT OF CODE FOR LOOKING AT RANDOM SENTENCES +#r=random.randint(0,len(strat_locations)); elem=strat_locations[r].split('\t'); elem[4].replace("~^~"," ") + +##USEFUL BIT OF CODE FOR LOOKING AT RANDOM RESULTS +#r=random.randint(0,len(strat_list)-1); show = "\n".join(str(x) for x in strat_list[r].split('\t')); show=show.replace(ARR_DELIM,' '); print "=========================\n" + show + "\n=========================" + + +##USEFUL BIT OF CODE FOR LOOKING AT ALL RESULTS +#for item in strat_list: +# show = "\n".join(str(x) for x in item.split('\t')) +# print "=========================\n" + show + "\n=========================" +# +#cursor.execute(""" SELECT * from sentences where doc_id='54b43272e138239d8685117b' and sent_id=352 """) +#dump=cursor.fetchall() +# +#cursor.execute(""" SELECT * from sentences where doc_id='54b43289e138239d868552b2' and sent_id=421 """) +#dump=cursor.fetchall() + + + + + diff --git a/udf/ext_strat_target.py b/udf/ext_strat_target.py new file mode 100755 index 0000000..7a9a12e --- /dev/null +++ b/udf/ext_strat_target.py @@ -0,0 +1,269 @@ +#============================================================================== +#DEFINE RELATIONSHIP BETWEEN TARGET ENTITIES AND STRATIGRAPHIC PHRASES +#============================================================================== + +#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf + +#============================================================================== +# ACQUIRE RELEVANT MODULES and DATA +#============================================================================== + +import time, random, psycopg2, yaml +from psycopg2.extensions import AsIs + +#tic +start_time = time.time() + +# Connect to Postgres +with open('./credentials', 'r') as credential_yaml: + credentials = yaml.load(credential_yaml) + +with open('./config', 'r') as config_yaml: + config = yaml.load(config_yaml) + +# Connect to Postgres +connection = psycopg2.connect( + dbname=credentials['postgres']['database'], + user=credentials['postgres']['user'], + password=credentials['postgres']['password'], + host=credentials['postgres']['host'], + port=credentials['postgres']['port']) +cursor = connection.cursor() + +#initalize the strat_target relationship table +cursor.execute(""" + DELETE FROM strat_target; +""") +connection.commit() + +#strat_phrases data dump +cursor.execute(""" + SELECT DISTINCT ON (strat_phrases.docid, + strat_phrases.sentid, + strat_phrase, + phrase_start, + phrase_end) + + strat_phrases.docid, + strat_phrases.sentid, + strat_phrase_root, + strat_flag, + strat_name_id, + phrase_start, + phrase_end, + int_name, + num_phrase, + strat_phrases.sentence, + strat_phrases.age_agree + + FROM strat_phrases, target_instances + WHERE strat_phrases.docid=target_instances.docid + AND strat_phrases.sentid=target_instances.sentid +""") + + +#convert list of tuples to list of lists +strat_list=cursor.fetchall() +strat_list = [list(elem) for elem in strat_list] + +#target_instances 
data dump +cursor.execute(""" + SELECT target_instances.docid, + target_instances.sentid, + target_word, + target_word_idx, + target_pose, + target_path, + target_parent, + target_children, + %(my_app)s_sentences_%(my_product)s.words, + target_id + FROM target_instances, %(my_app)s_sentences_%(my_product)s + WHERE target_instances.docid=%(my_app)s_sentences_%(my_product)s.docid + AND target_instances.sentid=%(my_app)s_sentences_%(my_product)s.sentid;""", + { + "my_app": AsIs(config['app_name']), + "my_product": AsIs(config['product'].lower()) +}) + +#convert list of tuples to list of lists +target_instances=cursor.fetchall() +target_instances = [list(elem) for elem in target_instances] + +#============================================================================== +# DEFINING RELATIONSHIP BETWEEN STRATIGRAPHY ENTITY/MENTION AND TARGET +#============================================================================== + +strat_target_list=[] + +#loop through all sentences with strat entities/mentions +for idx, line in enumerate(strat_list): + doc_id, sent_id, strat_phrase_root, strat_flag,strat_name_id,phrase_start,phrase_end,int_name,num_phrase,sentence,age_agree = line + + #grab the target instances for that same sentence + target=[s for k, s in enumerate(target_instances) if s[0]==doc_id and s[1]==sent_id] + + #loop through all target instances in that sentence + for idx2,elem in enumerate(target): + doc_id, sent_id, target_word,target_word_idx,target_pose,target_path,target_parent,target_children,words, target_id = elem + + #is the stratigraphic entity/mention a PARENT or CHILD of the target instance? + if list(set(target_parent) & set(range(phrase_start,phrase_end)))!=[]: + target_relation='parent' + elif list(set(sum(eval(target_children), [])) & set(range(phrase_start,phrase_end)))!=[]: + target_relation='child' + else: + target_relation='na' + + #what is the word DISTANCE between the strat mention/entity and the target instance? + target_distance=[max(target_word_idx)-i for i in range(phrase_start,phrase_end)] + target_distance=target_distance+[min(target_word_idx)-i for i in range(phrase_start,phrase_end)] + + # target found WITHIN the strat phrase (e.g. 
Upper Stromatolitic Carbonate Member) + if sum(n > 0 for n in target_distance)!=0 and sum(n < 0 for n in target_distance)!=0: + target_distance=0 + #target found BEHIND the strat phrase + elif sum(n > 0 for n in target_distance)==0: + target_distance = max(target_distance) + #target found AHEAD of the strat_phrase + else: + target_distance = min(target_distance) + + #grab the bag of words + if target_distance>1: + words_between = words[phrase_end:phrase_end+(target_distance)] + elif target_distance<-1: + words_between = words[phrase_start+(target_distance):phrase_start] + else: + words_between='{}' + + #dump to local variable + strat_target_list.append([doc_id, sent_id, strat_phrase_root,num_phrase, + target_relation,target_distance,sentence, + strat_flag,phrase_start,phrase_end,int_name, + words_between,target_word,target_word_idx]) + #write to PSQL table + cursor.execute(""" + INSERT INTO strat_target( docid, + sentid, + target_word, + target_word_idx, + strat_phrase_root, + strat_flag, + strat_name_id, + strat_start, + strat_end, + int_name, + num_phrase, + target_relation, + target_distance, + words_between, + sentence, + age_agree, + target_id) + VALUES (%s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, %s, %s);""", + + (doc_id, sent_id, target_word, + target_word_idx, strat_phrase_root, strat_flag, + strat_name_id,phrase_start,phrase_end, + int_name,num_phrase,target_relation, + target_distance,words_between,sentence,age_agree, target_id) + ) + +connection.commit() + +#some sort of magic +connection.set_isolation_level(0) +cursor.execute(""" VACUUM ANALYZE strat_target; +""") +connection.commit() + +#============================================================================== +# PROVIDE SUMMARIES FOR AGE-AGREEMENT BETWEEN STRAT_PHRASE AND MACROSTRAT STRAT_NAME +#============================================================================== + +#initialize the age_agree column in strat_phrases +cursor.execute(""" + UPDATE strat_target + SET age_sum = '-'; +""") +connection.commit() + +#gather distinct Macrostrat links +cursor.execute(""" + SELECT DISTINCT (strat_name_id) FROM strat_target; +""") + +#convert list of tuples to list of lists +tocheck=cursor.fetchall() +tocheck = [list(elem) for elem in tocheck] + +#find all instances of strat_name_id occuring in the age_check table +cursor.execute(""" + WITH query AS(SELECT DISTINCT (strat_name_id) FROM strat_target) + + SELECT strat_phrases.strat_name_id, strat_phrases.age_agree FROM strat_phrases,query + WHERE strat_phrases.strat_name_id=query.strat_name_id + AND strat_phrases.age_agree<>'-'; + """, +) + +#convert list of tuples to list of lists +results=cursor.fetchall() +results = [list(elem) for elem in results] + +#loop through all strat_name_ids and summarize age agreement discoveries +for idx,name in enumerate(tocheck): + tmp = [i for i in results if i[0]==name[0]] + ids = name[0].split('~') + + #initialize the age agreement list + counts = [[0] * 2 for i in range(len(ids))] + + #loop through all comparisons between a strat_name_id string and interval information + for idx2,item in enumerate(tmp): + #consider each strat_name in the strat_name_string + ans = item[1].split('~') + + #record whether its an allowable or disallowable match + for idx3,data in enumerate(ans): + if data=='yes': + counts[idx3][0]+=1 + elif data=='no': + counts[idx3][1]+=1 + + #record the age agreement summary + tocheck[idx].extend([counts]) + + #variables to push to PSQL database + strat_name_id=name[0] + str_counts=str(counts) + + #write to 
diff --git a/udf/ext_strat_target_distant.py b/udf/ext_strat_target_distant.py
new file mode 100755
index 0000000..0093159
--- /dev/null
+++ b/udf/ext_strat_target_distant.py
@@ -0,0 +1,326 @@
+#==============================================================================
+#DEFINE RELATIONSHIP BETWEEN TARGET ENTITIES AND DISTANT STRATIGRAPHIC PHRASES
+#==============================================================================
+
+#==============================================================================
+# ACQUIRE RELEVANT MODULES and DATA
+#==============================================================================
+
+import time, random, psycopg2, yaml
+from psycopg2.extensions import AsIs
+
+#tic
+start_time = time.time()
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+
+cursor = connection.cursor()
+
+doc_cursor=connection.cursor()
+target_cursor=connection.cursor()
+strat_cursor = connection.cursor()
+sent_cursor = connection.cursor()
+
+#VACUUM ANALYZE cannot run inside a transaction, so switch psycopg2 to
+#autocommit (isolation level 0) before refreshing the planner statistics
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE target_instances;
+""")
+connection.commit()
+
+#refresh planner statistics for strat_phrases
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE strat_phrases;
+""")
+connection.commit()
+
+#refresh planner statistics for the sentences table
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE %(my_app)s_sentences_%(my_product)s;
+""", {
+    "my_app": AsIs(config['app_name']),
+    "my_product": AsIs(config['product'].lower())
+})
+connection.commit()
+
+#==============================================================================
+# FIND STRATIGRAPHIC PHRASES NEAREST TO ORPHAN TARGET INSTANCES
+#==============================================================================
+
+#how many sentences back from an orphan to look for stratigraphic phrases
+strat_distance=3
+
+#initialize the dump variable
+strat_target_distant=[]
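+
+#EXAMPLE: for an orphan at sentid 41 with strat_phrases at sentids 36, 39 and
+#40, only 39 and 40 fall within the three-sentence window above, and the
+#closest preceding sentence (40) supplies the linked strat_phrase(s)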
+
+#list of docids with orphaned targets
+doc_cursor.execute("""
+    SELECT DISTINCT ON (target_instances.docid)
+
+        target_instances.docid
+    FROM target_instances, %(my_app)s_sentences_%(my_product)s
+    WHERE target_instances.target_id
+        NOT IN (select strat_target.target_id from strat_target)
+        AND num_strat_doc<>0
+        AND target_instances.docid=%(my_app)s_sentences_%(my_product)s.docid
+        AND target_instances.sentid=%(my_app)s_sentences_%(my_product)s.sentid
+    ORDER BY target_instances.docid ASC, target_instances.sentid ASC
+""", {
+    "my_app": AsIs(config['app_name']),
+    "my_product": AsIs(config['product'].lower())
+})
+
+
+#initialize the strat_target_distant relationship table
+cursor.execute("""
+    DELETE FROM strat_target_distant;
+""")
+connection.commit()
+
+#loop through document list
+for idx,doc in enumerate(doc_cursor):
+    #orphaned targets from a given document
+    target_cursor.execute("""
+        SELECT DISTINCT ON (target_instances.docid,
+                            target_instances.sentid,
+                            target_instances.target_word_idx)
+
+            target_instances.docid,
+            target_instances.sentid,
+            target_word,
+            target_word_idx,
+            target_parent,
+            target_children,
+            %(my_app)s_sentences_%(my_product)s.words,
+            target_id
+        FROM target_instances, %(my_app)s_sentences_%(my_product)s
+        WHERE target_instances.target_id
+            NOT IN (select strat_target.target_id from strat_target)
+            AND target_instances.docid=%(my_docid)s
+            AND target_instances.docid=%(my_app)s_sentences_%(my_product)s.docid
+            AND target_instances.sentid=%(my_app)s_sentences_%(my_product)s.sentid
+        ORDER BY target_instances.docid ASC, target_instances.sentid ASC
+    """, {
+        "my_app": AsIs(config['app_name']),
+        "my_product": AsIs(config['product'].lower()),
+        "my_docid": doc[0]
+    })
+
+    #convert list of tuples to list of lists
+    tmp_target=target_cursor.fetchall()
+    tmp_target = [list(elem) for elem in tmp_target]
+
+    #define the sentences where those instances come from
+    sentids = [item[1] for item in tmp_target]
+
+    #gather all stratigraphic phrases from the docid that occur before the deepest orphan
+    sent_query = max(sentids)
+
+    #strat_phrases from the document that precede the orphan deepest into the document
+    strat_cursor.execute("""
+        SELECT DISTINCT ON (docid, sentid, strat_phrase_root,strat_name_id)
+            docid, sentid, strat_phrase_root, strat_flag, num_phrase, strat_name_id,int_name,age_agree from strat_phrases
+        WHERE docid=%s
+        AND sentid<%s
+        ORDER BY sentid ASC;""",
+        (doc[0], sent_query)
+    )
+
+    #convert list of tuples to list of lists
+    tmp_strat=strat_cursor.fetchall()
+    tmp_strat = [list(elem) for elem in tmp_strat]
+
+    #loop through the list of orphans
+    for idx2,target in enumerate(tmp_target):
+        #define set of variables from this particular orphan
+        target_sent=target[1]
+        target_word=target[2]
+        parent = target[4]
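+        #target_children is stored as a stringified list of lists (one list of
+        #child indices per word in the target); eval() restores it and
+        #sum(list_of_lists, []) flattens it, e.g. sum([[3], [7, 9]], []) -> [3, 7, 9]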
+        children = list(sum(eval(target[5]), []))
+        words = target[6]
+        target_id=target[7]
+
+        #find all stratigraphic phrases that occur before this orphan and within the defined buffer
+        strat_find = [item[1] for item in tmp_strat if target_sent-item[1]<=strat_distance and target_sent-item[1]>0]
+
+        #if candidate strat_phrase(s) are found
+        if strat_find:
+            #select the closest sentence with phrase(s)
+            strat_find=max(strat_find)
+            #collect all the strat_phrase(s) in that sentence
+            strat_info = [item for item in tmp_strat if item[1]==strat_find]
+
+            #define the sentids for sentences that bridge the strat_phrase(s) to the orphan
+            sent_inbetween=range(strat_find,target[1]+1)
+
+            #collect the words between strat_phrases and orphaned target
+            sent_cursor.execute("""
+                SELECT docid, sentid, words from %(my_app)s_sentences_%(my_product)s
+                WHERE docid=%(my_docid)s
+                AND sentid=ANY(%(my_sentid)s)
+                ORDER BY sentid ASC;""",
+                {
+                    "my_app": AsIs(config['app_name']),
+                    "my_product": AsIs(config['product'].lower()),
+                    "my_docid": doc[0],
+                    "my_sentid": sent_inbetween
+                }
+            )
+
+            #convert list of tuples to list of lists
+            words_between=sent_cursor.fetchall()
+            words_between = [list(elem) for elem in words_between]
+            words_between = [' '.join(item[2]) for item in words_between]
+            words_between = ''.join(words_between)
+
+            #define the distance between orphan and strat_phrase(s) sentence
+            target_distance = target[1]-strat_find
+
+            #define grammatical parent and children (as words) of the orphan
+            parent = [words[i] for i in parent]
+            children = [words[i] for i in children]
+
+            #loop through all the strat_phrases found in the nearest host sentence
+            for match in strat_info:
+                #info about the strat_phrase
+                [docid, sentid, strat_phrase_root,
+                 strat_flag, num_phrase, strat_name_id,
+                 int_name, age_agree] = match
+
+                toadd=[docid, sentid, strat_phrase_root,
+                       strat_flag, num_phrase, strat_name_id,
+                       int_name, age_agree, target_distance,
+                       target_id,target_word,parent,children,
+                       words_between]
+
+                #dump to local variable
+                strat_target_distant.append(toadd)
+
+                #write to PSQL table
+                cursor.execute("""
+                    INSERT INTO strat_target_distant(   docid,
+                                                        sentid,
+                                                        strat_phrase_root,
+                                                        strat_flag,
+                                                        num_phrase,
+                                                        strat_name_id,
+                                                        int_name,
+                                                        age_agree,
+                                                        target_sent_dist,
+                                                        target_id,
+                                                        target_word,
+                                                        target_parent,
+                                                        target_children,
+                                                        words_between)
+                    VALUES (%s, %s, %s, %s, %s,
+                            %s, %s, %s, %s, %s,
+                            %s, %s, %s, %s);""",
+
+                    (docid, sentid, strat_phrase_root,
+                     strat_flag, num_phrase, strat_name_id,
+                     int_name, age_agree, target_distance,
+                     target_id,target_word,parent,children,
+                     words_between)
+                )
+
+#push the insertions
+connection.commit()
+
+
+#==============================================================================
+# PROVIDE SUMMARIES FOR AGE-AGREEMENT BETWEEN STRAT_PHRASE AND MACROSTRAT STRAT_NAME
+#==============================================================================
+
+#initialize the age_sum column in strat_target_distant
+cursor.execute("""
+    UPDATE strat_target_distant
+    SET age_sum = '-';
+""")
+connection.commit()
+
+#gather distinct Macrostrat links
+cursor.execute("""
+    SELECT DISTINCT (strat_name_id) FROM strat_target_distant;
+""")
+
+#convert list of tuples to list of lists
+tocheck=cursor.fetchall()
+tocheck = [list(elem) for elem in tocheck]
+
+#find all age-agreement calls already recorded for those strat_name_ids in strat_phrases
+cursor.execute("""
+    WITH query AS(SELECT DISTINCT (strat_name_id) FROM strat_target_distant)
+
+    SELECT strat_phrases.strat_name_id, strat_phrases.age_agree FROM strat_phrases,query
+    WHERE strat_phrases.strat_name_id=query.strat_name_id
+    AND strat_phrases.age_agree<>'-';
+    """,
+)
+
+#convert list of tuples to list of lists
+results=cursor.fetchall()
+results = [list(elem) for elem in results]
+
+#loop through all strat_name_ids and summarize age agreement discoveries
+for idx,name in enumerate(tocheck):
+    tmp = [i for i in results if i[0]==name[0]]
+    ids = name[0].split('~')
+
+    #initialize the age agreement list
+    counts = [[0] * 2 for i in range(len(ids))]
+
+    #loop through all comparisons between a strat_name_id string and interval information
+    for idx2,item in enumerate(tmp):
+        #consider each strat_name in the strat_name_string
+        ans = item[1].split('~')
+
+        #record whether it's an allowable or disallowable match
+        for idx3,data in enumerate(ans):
+            if data=='yes':
+                counts[idx3][0]+=1
+            elif data=='no':
+                counts[idx3][1]+=1
+
+    #record the age agreement summary
+    tocheck[idx].extend([counts])
+
+    #variables to push to PSQL database
+    strat_name_id=name[0]
+    str_counts=str(counts)
+
+    #write to PSQL table
+    cursor.execute("""
+        UPDATE strat_target_distant
+        SET age_sum = %s
+        WHERE strat_name_id = %s;""",
+
+        (str_counts, strat_name_id)
+    )
+
+connection.commit()
+
+
+#summary statistic
+success = 'number of strat-distant target tuples : %s' %len(strat_target_distant)
+
+#toc
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
+
+
+#show a random result
+r=random.randint(0,len(strat_target_distant)-1)
+show = "\n".join(str(x) for x in strat_target_distant[r])
+print "=========================\n" + show + "\n========================="
+
+#close the postgres connection
+connection.close()
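
Once populated, `strat_target_distant` can be spot-checked with an ordinary query; a minimal sketch, assuming it runs while a psycopg2 connection and cursor are still open (the column names come from `udf/initdb.py`):

````
# illustrative spot-check; assumes an open psycopg2 connection and cursor
cursor.execute("""
    SELECT docid, sentid, strat_phrase_root, target_word,
           target_sent_dist, age_sum
    FROM strat_target_distant
    ORDER BY docid, sentid;
""")
for row in cursor.fetchall():
    print row
````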
diff --git a/udf/ext_target.py b/udf/ext_target.py
new file mode 100755
index 0000000..152c7c2
--- /dev/null
+++ b/udf/ext_target.py
@@ -0,0 +1,166 @@
+#==============================================================================
+#TARGET NAME EXTRACTOR
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf
+
+#==============================================================================
+# import relevant modules and data
+#==============================================================================
+import time, random, re, yaml, psycopg2
+from psycopg2.extensions import AsIs
+
+start_time = time.time()
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+cursor = connection.cursor()
+
+#initialize the target_instances table
+cursor.execute("""
+    DELETE FROM target_instances;
+""")
+
+#IMPORT THE SENTENCES DUMP
+cursor.execute("""
+    SELECT docid, sentid, words, poses, dep_paths, dep_parents FROM %(my_app)s_sentences_%(my_product)s;
+""", {
+    "my_app": AsIs(config['app_name']),
+    "my_product": AsIs(config['product'].lower())
+})
+
+#push the delete to the database
+connection.commit()
+
+
+#initialize list of target occurrences
+target_list=[]
+
+#TARGET DEFINITIONS
+with open('./var/target_variables.txt') as fid:
+    target_variables = fid.readlines()
+
+for i in target_variables:
+    exec i
+
+#loop through all sentences
+to_write = []
+for line in cursor:
+    #collect individual elements from the psql sentences dump
+    docid, sentid, words, poses, dep_paths, dep_parents = line
+
+    #initialize list of local target occurrences
+    targets = []
+
+    #sentence string
+    sent = ' '.join(words)
+
+    #loop through all the target names
+    for name in target_names:
+        #starting index of all matches for a target_name in the joined sentence
+        matches=[m.start() for m in re.finditer(name,sent.lower())]
+
+        if matches:
+            #if at least one match is found, count the spaces backward to arrive at a word index
+            indices = [sent[0:m].count(' ') for m in matches]
+            #remove double hits (e.g. stromatolitic-thrombolitic)
+            indices = list(set(indices))
+            #a target_name spans its starting word index to the number of words in the phrase
+            target_word_idx = [[i,i+len(name.split(' '))] for i in indices]
+
+            #initialize other data about a found target_name
+            target_pose=[]
+            target_path=[]
+            target_parent=[]
+
+            for span in target_word_idx:
+                #poses, paths and parents can be found at the same indices as a target_name find
+                target_word = ' '.join(words[span[0]:span[1]])
+
+                if target_word.lower() not in bad_words:
+                    target_children=[]
+                    target_pose = poses[span[0]:span[1]]
+                    target_path = dep_paths[span[0]:span[1]]
+                    target_parent = dep_parents[span[0]:span[1]]
+
+                    #children of each component of a target_name
+                    for span_idx in range(span[0], span[1]):
+                        children = [j for j,i in enumerate(dep_parents) if i==span_idx+1]
+                        target_children.append(children)
+
+                    #convert parent ids to Pythonic (0-based) ids
+                    target_parent = [i-1 for i in target_parent]
+
+                    #add finds to a local variable
+                    target_list.append([docid, sentid, target_word, span, target_pose, target_path, target_parent, target_children, sent])
+
+                    #for easier storage, convert the list of target_children lists to a string
+                    str_target_children = str(target_children)
+
+                    #queue the row for the PSQL table
+                    to_write.append(
+                        (docid, sentid, target_word, span, target_pose, target_path, target_parent, str_target_children, sent)
+                    )
+
+cursor.executemany("""
+    INSERT INTO target_instances(   docid,
+                                    sentid,
+                                    target_word,
+                                    target_word_idx,
+                                    target_pose,
+                                    target_path,
+                                    target_parent,
+                                    target_children,
+                                    sentence)
+    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);""",
+    to_write
+)
+
+#push insertions to the database
+connection.commit()
+
+#restart the primary key: drop it...
+cursor.execute("""
+    ALTER TABLE target_instances DROP target_id;
+""")
+
+#push the drop to the database
+connection.commit()
+
+#...and add it back so target_ids are sequential
+cursor.execute(""" ALTER TABLE target_instances ADD COLUMN target_id SERIAL PRIMARY KEY;
+""")
+connection.commit()
+
+
+#VACUUM ANALYZE cannot run inside a transaction, so switch psycopg2 to
+#autocommit (isolation level 0) before refreshing the planner statistics
+connection.set_isolation_level(0)
+cursor.execute(""" VACUUM ANALYZE target_instances;
+""")
+connection.commit()
+
+#close the connection
+connection.close()
+
+#summary statistic
+success = 'number of target instances: %s' %len(target_list)
+
+#summary of performance time
+elapsed_time = time.time() - start_time
+print '\n ###########\n\n %s \n elapsed time: %d seconds\n\n ###########\n\n' %(success,elapsed_time)
+
+
+#USEFUL BIT OF CODE FOR LOOKING AT RANDOM RESULTS
+r=random.randint(0,len(target_list)-1)
+print "=========================\n"
+print("\n".join(str(target) for target in target_list[r]))
+print "\n========================="
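
The index arithmetic in `ext_target.py` — character offsets from `re.finditer` converted to word indices by counting the spaces to their left — is easiest to see on a toy sentence. A minimal sketch (the sentence is hypothetical):

````
import re

# toy inputs; the real words come from the sentences table
words = ['Stromatolites', 'occur', 'in', 'the', 'Biwabik', 'Formation', '.']
sent = ' '.join(words)

# character offset of each match for one target name
matches = [m.start() for m in re.finditer('stromatol', sent.lower())]  # [0]

# spaces to the left of a character offset = index of the matched word
indices = [sent[0:m].count(' ') for m in matches]                      # [0]

# a one-word target spans [start, start + 1)
span = [indices[0], indices[0] + len('stromatol'.split(' '))]
print words[span[0]:span[1]]                                           # ['Stromatolites']
````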
diff --git a/udf/ext_target_adjective.py b/udf/ext_target_adjective.py
new file mode 100755
index 0000000..0133ddc
--- /dev/null
+++ b/udf/ext_target_adjective.py
@@ -0,0 +1,100 @@
+#==============================================================================
+#TARGET ADJECTIVE EXTRACTOR
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf
+
+#==============================================================================
+# import relevant modules and data
+#==============================================================================
+import time, random, re, yaml, psycopg2
+from psycopg2.extensions import AsIs
+
+start_time = time.time()
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+cursor = connection.cursor()
+
+#IMPORT TARGETS WITH DEPENDENTS
+cursor.execute("""
+    SELECT docid, sentid, target_id, target_word, target_children
+
+    FROM target_instances
+    WHERE target_children<>'[[]]';
+""")
+
+target=cursor.fetchall()
+
+
+#IMPORT THE SENTENCES DUMP
+cursor.execute("""
+    WITH temp as (
+        SELECT DISTINCT ON (docid, sentid) docid, sentid
+        FROM target_instances
+        WHERE target_children<>'[[]]'
+    )
+
+    SELECT s.docid, s.sentid, words, poses
+    FROM %(my_app)s_sentences_%(my_product)s AS s
+
+    JOIN temp ON temp.docid=s.docid AND temp.sentid=s.sentid;
+    """, {
+        "my_app": AsIs(config['app_name']),
+        "my_product": AsIs(config['product'].lower())
+    })
+
+sentences=cursor.fetchall()
+
+#initialize the target_adjectives table
+cursor.execute("""
+    DELETE FROM target_adjectives;
+""")
+
+#push the delete to the database
+connection.commit()
+
+
+adj=[]
+for idx,line in enumerate(target):
+    docid, sentid, target_id, target_word, target_children = line
+    target_children = eval(target_children)
+    target_children = target_children[0]
+
+    sent = [elem for elem in sentences if elem[0]==docid and elem[1]==sentid]
+
+    for c in target_children:
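+        #sent should be a one-element list, so sent[0][2] is the words array
+        #and sent[0][3] the poses array; 'JJ' is the Penn Treebank adjective tag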
+        if sent[0][3][c]=='JJ':
+            adj.append([docid, sentid, target_id, target_word, sent[0][2][c]])
+
+            #write to PSQL table
+            cursor.execute("""
+                INSERT INTO target_adjectives(  docid,
+                                                sentid,
+                                                target_id,
+                                                target_word,
+                                                target_adjective)
+                VALUES (%s, %s, %s, %s, %s);""",
+                (docid, sentid, target_id, target_word, sent[0][2][c])
+            )
+
+        #sanity check: child word indices should never be negative
+        if c<0:
+            print 'something is up!'
+
+#push insertions to the database
+connection.commit()
+
+#close the connection
+connection.close()
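
In miniature, the adjective harvest above is parallel-array indexing: a target's grammatical children are looked up in the sentence's `poses` array and kept when tagged `JJ`. A sketch with hypothetical arrays:

````
# hypothetical parallel arrays for one sentence
words = ['domal', 'stromatolites', 'are', 'abundant']
poses = ['JJ', 'NNS', 'VBP', 'JJ']

# hypothetical children of 'stromatolites' (0-based word indices)
target_children = [0]

# keep the children tagged as adjectives (Penn Treebank 'JJ')
adjectives = [words[c] for c in target_children if poses[c] == 'JJ']
print adjectives  # ['domal']
````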
diff --git a/udf/initdb.py b/udf/initdb.py
new file mode 100755
index 0000000..d723f1c
--- /dev/null
+++ b/udf/initdb.py
@@ -0,0 +1,194 @@
+#==============================================================================
+#INITIALIZE POSTGRES TABLES
+#==============================================================================
+
+#path: /Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/udf
+
+#==============================================================================
+
+import yaml
+import psycopg2
+from psycopg2.extensions import AsIs
+
+#load local credentials and project configuration
+with open('./credentials', 'r') as credential_yaml:
+    credentials = yaml.load(credential_yaml)
+
+with open('./config', 'r') as config_yaml:
+    config = yaml.load(config_yaml)
+
+# Connect to Postgres
+connection = psycopg2.connect(
+    dbname=credentials['postgres']['database'],
+    user=credentials['postgres']['user'],
+    password=credentials['postgres']['password'],
+    host=credentials['postgres']['host'],
+    port=credentials['postgres']['port'])
+cursor = connection.cursor()
+
+#SENTENCES TABLE
+#DROP TABLE IF EXISTS sentences CASCADE;
+#CREATE TABLE sentences (docid text, sentid integer, wordidx integer[], words text[], poses text[], ners text[], lemmas text[], dep_paths text[], dep_parents integer[]);
+#COPY sentences FROM '/Users/jhusson/local/bin/deepdive-0.7.1/deepdive-apps/stromatolites/input/strom_nlp352';
+
+
+#TARGET_INSTANCES
+cursor.execute("""
+    DROP TABLE IF EXISTS target_instances CASCADE;
+    CREATE TABLE target_instances(
+        target_id serial PRIMARY KEY,
+        docid text,
+        sentid int,
+        target_word text,
+        num_strat_doc int DEFAULT 0,
+        target_word_idx int[],
+        target_pose text[],
+        target_path text[],
+        target_parent int[],
+        target_children text,
+        sentence text);
+""")
+connection.commit()
+
+#TARGET_ADJECTIVES
+cursor.execute("""
+    DROP TABLE IF EXISTS target_adjectives CASCADE;
+    CREATE TABLE target_adjectives(
+        docid text,
+        sentid int,
+        target_id int,
+        target_word text,
+        target_adjective text);
+""")
+connection.commit()
+
+#STRAT_PHRASES
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_phrases CASCADE;
+    CREATE TABLE strat_phrases(
+        docid text,
+        sentid int,
+        strat_phrase text,
+        strat_phrase_root text,
+        num_phrase int,
+        sentence text,
+        strat_flag text,
+        phrase_start int,
+        phrase_end int,
+        strat_name_id text,
+        int_name text,
+        int_id int,
+        age_agree text DEFAULT '-');
+""")
+connection.commit()
+
+#STRAT_DICT
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_dict CASCADE;
+    CREATE TABLE strat_dict(
+        docid text,
+        strat_phrase text[]);
+""")
+connection.commit()
+
+
+#STRAT_TARGET
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_target CASCADE;
+    CREATE TABLE strat_target(
+        docid text,
+        sentid int,
+        refs_loc int,
+        in_ref text DEFAULT 'no',
+        strat_phrase_root text,
+        num_phrase int,
+        target_relation text,
+        target_distance int,
+        sentence text,
+        strat_flag text,
+        strat_name_id text,
+        strat_start int,
+        strat_end int,
+        int_name text,
+        age_agree text DEFAULT '-',
+        age_sum text DEFAULT '-',
+        words_between text[],
+        target_word text,
+        target_word_idx int[],
+        target_id int
+        );
+""")
+connection.commit()
+
+#AGE CHECK
+cursor.execute("""
+    DROP TABLE IF EXISTS age_check CASCADE;
+    CREATE TABLE age_check(
+        strat_phrase_root text,
+        strat_flag text,
+        strat_name_id text,
+        int_name text,
+        int_id int,
+        age_agree text);
+""")
+connection.commit()
+
+#STRAT_TARGET_DISTANT
+cursor.execute("""
+    DROP TABLE IF EXISTS strat_target_distant CASCADE;
+    CREATE TABLE strat_target_distant(
+        docid text,
+        sentid int,
+        refs_loc int,
+        in_ref text DEFAULT 'no',
+        strat_phrase_root text,
+        strat_flag text,
+        num_phrase int,
+        int_name text,
+        strat_name_id text,
+        age_agree text DEFAULT '-',
+        age_sum text DEFAULT '-',
+        words_between text,
+        target_sent_dist int,
+        target_word text,
+        target_parent text[],
+        target_children text[],
+        target_id int
+        );
+""")
+connection.commit()
+
+
+#BIB
+cursor.execute("""
+    DROP TABLE IF EXISTS bib CASCADE;
+    CREATE TABLE bib(
+        docid text,
+        author text[],
+        title text,
+        journal text,
+        url text,
+        journal_instances int
+        );
+""")
+connection.commit()
+
+#RESULTS
+cursor.execute("""
+    DROP TABLE IF EXISTS results CASCADE;
+    CREATE TABLE results(
+        target_id int,
+        docid text,
+        sentid int,
+        target_word text,
+        strat_phrase_root text,
+        strat_name_id text,
+        age_sum text,
+        source text,
+        phrase text
+        );
+""")
+connection.commit()
+
+# Disconnect from Postgres
+connection.close()
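
The commented-out `sentences` schema above describes the NLP input every extractor consumes. A hypothetical row (selected fields only) illustrates the parallel-array layout and the 1-based `dep_parents` convention (0 marks the root) that `ext_target.py` converts to 0-based Python indices:

````
# hypothetical sentences row (selected fields); all values are illustrative
docid = 'doc0001'
sentid = 1
words = ['Stromatolites', 'are', 'abundant']
poses = ['NNS', 'VBP', 'JJ']
dep_paths = ['nsubj', 'cop', 'root']
dep_parents = [3, 3, 0]  # 1-based heads: words 1 and 2 hang off word 3; 0 = root

# the conversion used in ext_target.py
target_parent = [i - 1 for i in dep_parents]
print target_parent  # [2, 2, -1]
````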
diff --git a/var/strat_variables.txt b/var/strat_variables.txt
new file mode 100755
index 0000000..30994c2
--- /dev/null
+++ b/var/strat_variables.txt
@@ -0,0 +1,19 @@
+#==============================================================================
+# DEFINE STRATIGRAPHIC VARIABLES
+#==============================================================================
+
+#delimiter to separate strat_entities from strat_name_ids in strat_dict
+DICT_DELIM='$$$'
+
+#words indicating stratigraphic names
+strat_flags = ["Group", "Formation", "Member", "Supergroup", "Bed", "Subgroup","Gp.", "Fm.", "Mbr.", "SGp.", "Gp", "Fm", "Mbr", "SGp"]
+
+lith_flags = ["Dolomite","Dolostone","Limestone","Sandstone","Shale","Conglomerate","Chert"]
+
+strat_flags = strat_flags+lith_flags
+
+#words indicating an age
+age_flags = ["Ma.", "Ga.", "Myr.","Ma", "Ga", "Myr"]
+
+#list of known and troublesome ligatures
+weird_strings = [['\xef\xac\x82', 'fl'], ['\xef\xac\x81', 'fi']]
diff --git a/var/target_variables.txt b/var/target_variables.txt
new file mode 100755
index 0000000..82ed5fd
--- /dev/null
+++ b/var/target_variables.txt
@@ -0,0 +1,11 @@
+#==============================================================================
+# DEFINE TARGET VARIABLES
+#==============================================================================
+
+#each string in this list will define a regular expression search
+# EXAMPLE: [r'\b' + ooid + r'\b', r'\b' + ooids + r'\b']
+# will find all instances of 'ooid' or 'ooids' bound by a non-alphanumeric character
+target_names = ['stromatol', 'thrombol']
+
+#an optional list of false hits
+bad_words = ['non-stromatolitic','nonstromatolitic','non-stromatolite']
\ No newline at end of file
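
As the comments in `var/target_variables.txt` note, each `target_names` entry is used directly as a regular expression by `ext_target.py`. A minimal sketch of how the unanchored pattern 'stromatol' behaves (the sentence is hypothetical):

````
import re

target_names = ['stromatol', 'thrombol']
sent = 'Domal stromatolites overlie thrombolitic bioherms .'

for name in target_names:
    print name, [m.start() for m in re.finditer(name, sent.lower())]

# stromatol [6]
# thrombol [28]
````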