
Commit

initial code commit
Mikhail Khodak committed May 17, 2018
1 parent b971925 commit 7e341ab
Showing 200 changed files with 254,786 additions and 0 deletions.
180 changes: 180 additions & 0 deletions compute.py
@@ -0,0 +1,180 @@
import sys
from collections import Counter
from collections import defaultdict
from itertools import chain
import nltk
import numpy as np
from scipy import sparse as sp
from sklearn.linear_model import LinearRegression as LR


FLOAT = np.float32
INT = np.int32


def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
'''sliding window around n-grams in a document
Args:
strdoc: list of tokens (as strings)
intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
n: n in n-gram
wndo2: half the window size
unkgram: n-gram to yield in place of n-grams not in vocabulary; if None, such n-grams are skipped
Returns:
generator of (n-gram, context window) pairs, where each context window is a generator of word indices (ints)
'''

wndo2pn = wndo2+n
unk = unkgram is not None
for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
if ngram in vocabulary:
yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
elif unk:
yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
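
A minimal usage sketch of ngram_context, assuming a toy document whose indices into some wordlist are already known:

doc = ['the', 'cat', 'sat', 'on', 'the', 'mat']
indices = [0, 1, 2, 3, 0, 4]
vocab = {('cat',), ('mat',)}
for ngram, window in ngram_context(doc, indices, vocab, n=1, wndo2=2):
    print(ngram, list(window))
# ('cat',) [0, 2, 3]  -- up to 2 word indices on each side of 'cat'
# ('mat',) [3, 0]     -- right context is empty at the document boundary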


def counts2mat(featcoocs, featlist, shape, dtype):
'''computes matrix from feature-word cooccurrence counts
Args:
featcoocs: dict mapping features to Counters
featlist: list of features
shape: matrix shape
dtype: dtype of matrix
Returns:
sparse matrix in CSR format
'''

rows, cols, values = zip(*((i, j, count) for i, feat in enumerate(featlist) for j, count in featcoocs[feat].items()))
return sp.coo_matrix((values, (rows, cols)), shape=shape, dtype=dtype).tocsr()
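
For instance, a sketch with toy counters (a plain dict works in place of a defaultdict here):

featcoocs = {'f0': Counter({0: 2, 3: 1}), 'f1': Counter({2: 5})}
mat = counts2mat(featcoocs, ['f0', 'f1'], (2, 4), INT)
# mat.toarray() == [[2, 0, 0, 1],
#                   [0, 0, 5, 0]]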


def cooc_matrix(corpus, featlist, wordlist, doc2wnd=ngram_context, unk=None, overlap=False, avg=False, wei=False, interval=1000000, verbose=False, comm=None, **kwargs):
'''constructs feature, word cooccurrence matrix
Args:
corpus: iterable of lists of strings
featlist: list of hashable features
wordlist: list of strings
doc2wnd: takes list of tokens, list of indices, and set of features and returns a (feature, index iterable) generator
unk: map words not in wordlist to this token (must be in wordlist); if None excludes OOV words
overlap: if True subtracts feature count from cooccurrence of feature with any word it contains; features must be iterable
avg: if True, average over the window size rather than summing raw cooccurrence counts
wei: if True, weight co-occurring words by inverse distance from the feature
interval: number of documents between conversions of accumulated counts to a sparse matrix
verbose: if True, write construction progress to stdout
comm: MPI Communicator; outputs are None for non-root processes
kwargs: passed to doc2wnd
Returns:
cooccurrence matrix in CSR format, vector of feature counts, vector of word counts
'''

assert not (overlap and (avg or wei)), "correcting for overlap not compatible with averaging or weighting"

featset = set(featlist)
featcounts = Counter()
F = len(featlist)
unki = -1 if unk is None else wordlist.index(unk)
word2index = {word: i for i, word in enumerate(wordlist)}
wordcounts = Counter()
V = len(wordlist)

rank, size = (0, 1) if comm is None else (comm.rank, comm.size)
write = lambda msg: sys.stdout.write(msg) and sys.stdout.flush()
dtype = FLOAT if (avg or wei) else INT
if not rank:
matrix = sp.csr_matrix((F, V), dtype=dtype)
featcoocs = defaultdict(Counter)

for i, doc in enumerate(corpus):
if i%size == rank:
indices = [word2index.get(word, unki) for word in doc]
wordcounts.update(indices)
if avg:
for feat, window in doc2wnd(doc, indices, featset, **kwargs):
window = list(window)
if window:
increment = 1.0/len(window)
cooccounts = featcoocs[feat]
for index in window:
cooccounts[index] += increment
featcounts[feat] += 1
elif wei:
for feat, window in doc2wnd(doc, indices, featset, **kwargs):
window = list(window)
if window:
length = len(window)
half = int(length/2)
recip = 1.0/length
cooccounts = featcoocs[feat]
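# the first half of the window is taken as the left context; weight grows toward the feature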
for j, index in enumerate(window[:half]):
cooccounts[index] += recip/(half-j)
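# the second half is taken as the right context; the nearest word gets the largest weight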
for j, index in enumerate(window[half:]):
cooccounts[index] += recip/(j+1)
featcounts[feat] += 1
else:
for feat, window in doc2wnd(doc, indices, featset, **kwargs):
featcoocs[feat].update(window)
featcounts[feat] += 1
if not (i+1)%interval:
if rank:
comm.send(counts2mat(featcoocs, featlist, (F, V), dtype), dest=0)
else:
matrix += sum((comm.recv(source=j) for j in range(1, size)), counts2mat(featcoocs, featlist, (F, V), dtype))
if verbose:
write('\rProcessed '+str(i+1)+' Documents; Sparsity: '+str(matrix.nnz)+'/'+str(F*V)+'; Coverage: '+str((matrix.sum(1)>0).sum())+'/'+str(F))
featcoocs = defaultdict(Counter)

if size > 1:
featcounts = comm.reduce(featcounts, root=0)
wordcounts = comm.reduce(wordcounts, root=0)
if rank:
comm.send(counts2mat(featcoocs, featlist, (F, V), dtype), dest=0)
return 3*[None]
matrix += sum((comm.recv(source=j) for j in range(1, size)), counts2mat(featcoocs, featlist, (F, V), dtype))

if overlap:
for feat, coocs in featcoocs.items():
count = featcounts[feat]
for word in feat:
index = word2index.get(word)
if index is not None:
coocs[index] -= count
if verbose:
write('\rProcessed '+str(i+1)+' Documents; Sparsity: '+str(matrix.nnz)+'/'+str(F*V)+'; Coverage: '+str((matrix.sum(1)>0).sum())+'/'+str(F)+'\n')
return matrix, np.array([featcounts[feat] for feat in featlist], dtype=INT), np.array([wordcounts[word2index[word]] for word in wordlist], dtype=INT)
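
A single-process usage sketch with a hypothetical toy corpus (no MPI communicator needed):

corpus = [['the', 'cat', 'sat'], ['the', 'mat']]
words = ['the', 'cat', 'sat', 'mat']
feats = [('cat',), ('mat',)]
matrix, featcounts, wordcounts = cooc_matrix(corpus, feats, words, n=1, wndo2=2)
# matrix[i, j] counts occurrences of words[j] within 2 tokens of feats[i]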


def symmetric_cooc_matrix(corpus, wordlist, unk=None, **kwargs):
'''constructs symmetric word, word cooccurrence matrix
Args:
corpus: iterable of lists of strings
wordlist: list of strings
unk: map words not in wordlist to this token (must be in wordlist); if None excludes OOV words
kwargs: passed to cooc_matrix
Returns:
cooccurrence matrix in CSR format, vector of word counts
'''

unkgram = None if unk is None else (unk,)
return cooc_matrix(corpus, [(word,) for word in wordlist], wordlist, unk=unk, n=1, unkgram=unkgram, **kwargs)[:2]
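
And a sketch of the symmetric case, reusing the toy corpus above:

matrix, counts = symmetric_cooc_matrix(corpus, words, wndo2=5)
# matrix is V x V in CSR format; counts[i] is the number of occurrences of words[i]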


def linear_transform(cooc_matrix, word_embeddings, word_counts, Regression=LR, weights=None, **kwargs):
'''learns linear transform from context vectors to original embeddings
Args:
cooc_matrix: cooccurrence matrix of size (V, V)
word_embeddings: embedding matrix of size (V, d)
word_counts: word count vector of length V
Regression: regression class (from sklearn.linear_model)
weights: sample weight vector of length V; ignored if None
kwargs: passed to Regression
Returns:
fitted Regression object
'''

select = word_counts > 0
if weights is not None:
select *= weights > 0
weights = weights[select]

return Regression(**kwargs).fit(cooc_matrix[select].dot(word_embeddings) / word_counts[select,None], word_embeddings[select], weights)
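
A self-contained sketch of the induction step with random placeholder data; real inputs would come from the cooccurrence pipeline in cooc.py below:

rng = np.random.RandomState(0)
V, d = 1000, 50
vectors = rng.randn(V, d).astype(FLOAT)        # stand-in for pretrained embeddings
counts = rng.randint(1, 100, size=V)           # stand-in for corpus word counts
cooc = sp.random(V, V, density=0.01, format='csr', random_state=rng)
reg = linear_transform(cooc, vectors, counts)
A = reg.coef_  # (d, d) map from averaged context vectors back to embeddings
# an unseen feature's embedding is then reg.predict(avg_context_vector.reshape(1, -1))
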
142 changes: 142 additions & 0 deletions cooc.py
@@ -0,0 +1,142 @@
# computes (feature, word) cooccurrence matrices given text corpora
# to run use command:
# mpirun -n 8 python ALaCarte/cooc.py $FEATURE $CORPUSFILE $VOCABFILE $OUTPUTROOT $N
# where:
# FEATURE is one of ngram, synset, word
# CORPUSFILE is the corpus textfile (skipped if $FEATURE==synset)
# VOCABFILE is the word vocabulary
# OUTPUTROOT is the output name (script will make files $OUTPUTROOT.npz and $OUTPUTROOT.pkl)
# N is a positive integer (skipped if not $FEATURE==ngram)
# if $FEATURE==ngram: computes cooccurrences for all n-grams in sst,sst_fine,imdb,mr,cr,subj,mpqa,trec,mrpc,sick tasks
# if $FEATURE==synset: computes cooccurrences for all synsets in SemCor
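# example invocation (hypothetical file names):
#   mpirun -n 8 python ALaCarte/cooc.py ngram corpus.txt vocab.txt output/bigrams 2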

import pickle
import sys
from itertools import chain
import nltk
import numpy as np
from nltk.corpus import semcor
from scipy import sparse as sp
from ALaCarte.compute import *
from text_embedding.documents import *
from text_embedding.features import *


UNK = '<unk>'


def ngram_vocab(n):
ngrams = lambda docs: {ngram for doc in tokenize(doc.lower() for doc in docs) for ngram in nltk.ngrams(doc, n)}
vocabulary = set.union(*(ngrams(task()[0]) for task in TASKMAP['cross-validation'].values()))
for task in TASKMAP['train-test split'].values():
for partition in ['train', 'test']:
try:
vocabulary = vocabulary.union(ngrams(task(partition)[0]))
except FileNotFoundError:
pass
return sorted(vocabulary)


def ntokens(tokens):
return len(list(split_on_punctuation(' '.join(tokens))))


def synset_context(sents):
def context(strdoc, intdoc, vocabulary, wndo2=None, unkfeat=None):
unk = unkfeat is not None
wndo2 = len(intdoc) if wndo2 is None else wndo2
offset = 0
for chunk in next(sents):
if type(chunk) == list:
length = ntokens(chunk)
else:
label = chunk.label()
if type(label) == str:
length = ntokens(chunk)
else:
length = ntokens(chunk[0])
synset = label.synset()
if synset in vocabulary:
yield synset, chain(intdoc[max(offset-wndo2, 0):offset], intdoc[offset+length:offset+length+wndo2])
elif unk:
yield unkfeat, chain(intdoc[max(offset-wndo2, 0):offset], intdoc[offset+length:offset+length+wndo2])
offset += length
return context


def synset_vocab():
labels = (chunk.label() for chunk in semcor.tagged_chunks(tag='sem') if not type(chunk) == list)
return sorted({label.synset() for label in labels if not type(label) == str})


def alacache(nameroot, feature='ngram'):
'''loads the output files produced by this script
Args:
nameroot: root of files (without extensions); the input argument 'outputroot'
feature: string name of feature that was computed
Returns:
if the output is a word x word cooccurrence (the 'word' feature): returns the cooc matrix, word vocab, and word counts; otherwise also returns the feature vocab and feature counts
'''

matrix = sp.load_npz(nameroot+'.npz')
with open(nameroot+'.pkl', 'rb') as f:
data = pickle.load(f)
if len(data) == 2:
return matrix, data['words'], data['counts']
return matrix, data['words'], data['wordcounts'], data[feature+'s'], data[feature+'counts']
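
For example, loading the (hypothetical) output of an n-gram run:

matrix, words, wordcounts, ngrams, ngramcounts = alacache('output/bigrams')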


if __name__ == '__main__':

feature = sys.argv[1]
if feature == 'ngram':
corpusfile, vocabfile, outputroot, n = sys.argv[2:6]
n = int(n)
elif feature == 'synset':
vocabfile, outputroot = sys.argv[2:4]
elif feature == 'word':
feature = ''
corpusfile, vocabfile, outputroot = sys.argv[2:5]
else:
raise NotImplementedError

with open(vocabfile, 'r') as f:
vocab = [line.split(' ')[0] for line in f]
vocab.append(UNK)

try:
from mpi4py import MPI
comm = MPI.COMM_WORLD
except ImportError:
comm = None

if feature:

if feature == 'ngram':
featurevocab = ngram_vocab(n)
with open(corpusfile, 'r') as f:
matrix, featurecounts, wordcounts = cooc_matrix((line.split() for line in f), featurevocab, vocab, n=n, unk=UNK, verbose=True, comm=comm)
elif feature == 'synset':
featurevocab = synset_vocab()
matrix, featurecounts, wordcounts = cooc_matrix(semcor.sents(), featurevocab, vocab, doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))), unk=UNK, interval=100, verbose=True, wndo2=None)
featurevocab = [synset.name() for synset in featurevocab]
else:
raise NotImplementedError

if comm is None or not comm.rank:
sp.save_npz(outputroot+'.npz', matrix)
with open(outputroot+'.pkl', 'wb') as f:
pickle.dump({'words': vocab, feature+'s': featurevocab, 'wordcounts': wordcounts, feature+'counts': featurecounts}, f)
else:
sys.exit()

else:

with open(corpusfile, 'r') as f:
matrix, counts = symmetric_cooc_matrix((line.split() for line in f), vocab, unk=UNK, verbose=True, comm=comm)
if comm is None or not comm.rank:
sp.save_npz(outputroot+'.npz', matrix)
with open(outputroot+'.pkl', 'wb') as f:
pickle.dump({'words': vocab, 'counts': counts}, f)
else:
sys.exit()
4 changes: 4 additions & 0 deletions data-SemEval2013_Task12/test/.directory
@@ -0,0 +1,4 @@
[Dolphin]
Timestamp=2014,4,23,16,37,56
Version=3
ViewMode=1