Mikhail Khodak committed May 17, 2018
1 parent b971925 · commit 7e341ab
Showing 200 changed files with 254,786 additions and 0 deletions.
@@ -0,0 +1,180 @@
import sys
from collections import Counter
from collections import defaultdict
from itertools import chain
import nltk
import numpy as np
from scipy import sparse as sp
from sklearn.linear_model import LinearRegression as LR


FLOAT = np.float32
INT = np.int32


def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
    '''sliding window around n-grams in a document
    Args:
        strdoc: list of tokens (as strings)
        intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
        vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
        n: n in n-gram
        wndo2: half the window size
        unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
    Returns:
        generator over (n-gram, context window) pairs, where each window is a generator of ints
    '''

    wndo2pn = wndo2+n
    unk = unkgram is not None
    for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
        if ngram in vocabulary:
            yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
        elif unk:
            yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
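
# Illustrative usage sketch (not part of the original module); the toy document,
# index list, and vocabulary below are hypothetical:
#
#   doc = ['the', 'cat', 'sat', 'on', 'the', 'mat']
#   ints = [0, 1, 2, 3, 0, 4]
#   vocab = {('cat', 'sat'), ('the', 'mat')}
#   for ngram, window in ngram_context(doc, ints, vocab, n=2, wndo2=2):
#       print(ngram, list(window))
#   # ('cat', 'sat') [0, 3, 0]   window truncated on the left at the document start
#   # ('the', 'mat') [2, 3]      window truncated on the right at the document end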


def counts2mat(featcoocs, featlist, shape, dtype):
    '''computes matrix from feature-word cooccurrence counts
    Args:
        featcoocs: dict mapping features to Counters
        featlist: list of features
        shape: matrix shape
        dtype: dtype of matrix
    Returns:
        sparse matrix in CSR format
    '''

    entries = [(i, j, count) for i, feat in enumerate(featlist) for j, count in featcoocs[feat].items()]
    if not entries:
        # an empty batch would make the zip(*...) unpacking below fail, so return an all-zero matrix
        return sp.csr_matrix(shape, dtype=dtype)
    rows, cols, values = zip(*entries)
    return sp.coo_matrix((values, (rows, cols)), shape=shape, dtype=dtype).tocsr()
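
# Illustrative usage sketch (not part of the original module); feature names and
# counts below are hypothetical:
#
#   coocs = {'f0': Counter({0: 2, 2: 1}), 'f1': Counter({1: 4})}
#   counts2mat(coocs, ['f0', 'f1'], (2, 3), INT).toarray()
#   # array([[2, 0, 1],
#   #        [0, 4, 0]], dtype=int32)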


def cooc_matrix(corpus, featlist, wordlist, doc2wnd=ngram_context, unk=None, overlap=False, avg=False, wei=False, interval=1000000, verbose=False, comm=None, **kwargs):
    '''constructs feature, word cooccurrence matrix
    Args:
        corpus: iterable of lists of strings
        featlist: list of hashable features
        wordlist: list of strings
        doc2wnd: takes list of tokens, list of indices, and set of features and returns a (feature, index iterable) generator
        unk: map words not in wordlist to this token (must be in wordlist); if None excludes OOV words
        overlap: if True subtracts feature count from cooccurrence of feature with any word it contains; features must be iterable
        avg: uses average over window size rather than cooccurrence counts
        wei: weight co-occurring words by distance from the feature
        interval: number of documents between conversions of counts to a sparse matrix
        verbose: write context matrix construction progress to stdout
        comm: MPI Communicator; outputs are None for non-root processes
        kwargs: passed to doc2wnd
    Returns:
        cooccurrence matrix in CSR format, vector of feature counts, vector of word counts
    '''

    assert not (overlap and (avg or wei)), "correcting for overlap not compatible with averaging or weighting"

    featset = set(featlist)
    featcounts = Counter()
    F = len(featlist)
    unki = -1 if unk is None else wordlist.index(unk)
    word2index = {word: i for i, word in enumerate(wordlist)}
    wordcounts = Counter()
    V = len(wordlist)

    rank, size = (0, 1) if comm is None else (comm.rank, comm.size)
    write = lambda msg: sys.stdout.write(msg) and sys.stdout.flush()
    dtype = FLOAT if (avg or wei) else INT
    # only the root process accumulates the sparse matrix; every process keeps its own counters
    if not rank:
        matrix = sp.csr_matrix((F, V), dtype=dtype)
    featcoocs = defaultdict(lambda: Counter())

    for i, doc in enumerate(corpus):
        # documents are assigned to processes round-robin
        if i%size == rank:
            indices = [word2index.get(word, unki) for word in doc]
            wordcounts.update(indices)
            if avg:
                for feat, window in doc2wnd(doc, indices, featset, **kwargs):
                    window = list(window)
                    if window:
                        increment = 1.0/len(window)
                        cooccounts = featcoocs[feat]
                        for index in window:
                            cooccounts[index] += increment
                    featcounts[feat] += 1
            elif wei:
                for feat, window in doc2wnd(doc, indices, featset, **kwargs):
                    window = list(window)
                    if window:
                        length = len(window)
                        half = int(length/2)
                        recip = 1.0/length
                        cooccounts = featcoocs[feat]
                        for j, index in enumerate(window[:half]):
                            cooccounts[index] += recip/(half-j)
                        for j, index in enumerate(window[half:]):
                            cooccounts[index] += recip/(j+1)
                    featcounts[feat] += 1
            else:
                for feat, window in doc2wnd(doc, indices, featset, **kwargs):
                    featcoocs[feat].update(window)
                    featcounts[feat] += 1
        # every `interval` documents all processes flush their counters to the root and reset
        if not (i+1)%interval:
            if rank:
                comm.send(counts2mat(featcoocs, featlist, (F, V), dtype), dest=0)
            else:
                matrix += sum((comm.recv(source=j) for j in range(1, size)), counts2mat(featcoocs, featlist, (F, V), dtype))
                if verbose:
                    write('\rProcessed '+str(i+1)+' Documents; Sparsity: '+str(matrix.nnz)+'/'+str(F*V)+'; Coverage: '+str((matrix.sum(1)>0).sum())+'/'+str(F))
            featcoocs = defaultdict(lambda: Counter())

    if size > 1:
        featcounts = comm.reduce(featcounts, root=0)
        wordcounts = comm.reduce(wordcounts, root=0)
        if rank:
            comm.send(counts2mat(featcoocs, featlist, (F, V), dtype), dest=0)
            return 3*[None]

    if overlap:
        # subtract each feature's count from its cooccurrence with the words it contains
        # (applied to the counters not yet folded into the matrix)
        for feat, coocs in featcoocs.items():
            count = featcounts[feat]
            for word in feat:
                index = word2index.get(word)
                if index is not None:
                    coocs[index] -= count

    matrix += sum((comm.recv(source=j) for j in range(1, size)), counts2mat(featcoocs, featlist, (F, V), dtype))

    if verbose:
        write('\rProcessed '+str(i+1)+' Documents; Sparsity: '+str(matrix.nnz)+'/'+str(F*V)+'; Coverage: '+str((matrix.sum(1)>0).sum())+'/'+str(F)+'\n')
    return matrix, np.array([featcounts[feat] for feat in featlist], dtype=INT), np.array([wordcounts[word2index[word]] for word in wordlist], dtype=INT)
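
# Illustrative usage sketch (not part of the original module): a bigram-word
# cooccurrence matrix over a toy corpus, run serially (comm=None). The corpus,
# bigram list, and word list below are hypothetical.
#
#   corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
#   bigrams = [('the', 'cat'), ('the', 'dog')]
#   words = ['the', 'cat', 'sat', 'dog', '<unk>']
#   matrix, featcounts, wordcounts = cooc_matrix(corpus, bigrams, words, unk='<unk>', n=2, wndo2=2)
#   # matrix is a 2x5 CSR matrix of window counts (here each bigram co-occurs once with 'sat'),
#   # featcounts is array([1, 1]) since each bigram occurs once, and wordcounts counts every token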


def symmetric_cooc_matrix(corpus, wordlist, unk=None, **kwargs):
    '''constructs symmetric word, word cooccurrence matrix
    Args:
        corpus: iterable of lists of strings
        wordlist: list of strings
        unk: map words not in wordlist to this token (must be in wordlist); if None excludes OOV words
        kwargs: passed to cooc_matrix
    Returns:
        cooccurrence matrix in CSR format, vector of word counts
    '''

    unkgram = None if unk is None else (unk,)
    return cooc_matrix(corpus, [(word,) for word in wordlist], wordlist, unk=unk, n=1, unkgram=unkgram, **kwargs)[:2]
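
# Illustrative usage sketch (not part of the original module); the corpus and word
# list below are hypothetical:
#
#   corpus = [['the', 'cat', 'sat'], ['a', 'cat', 'ran']]
#   words = ['the', 'cat', 'sat', '<unk>']
#   matrix, counts = symmetric_cooc_matrix(corpus, words, unk='<unk>')
#   # matrix[i, j] counts how often words[j] occurs within 5 tokens of words[i]
#   # (the default wndo2 of ngram_context); OOV tokens 'a' and 'ran' map to '<unk>'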


def linear_transform(cooc_matrix, word_embeddings, word_counts, Regression=LR, weights=None, **kwargs):
    '''learns linear transform from context vectors to original embeddings
    Args:
        cooc_matrix: cooccurrence matrix of size (V, V)
        word_embeddings: embedding matrix of size (V, d)
        word_counts: word count vector of length V
        Regression: regression class (from sklearn.linear_model)
        weights: sample weight vector of length V; ignored if None
        kwargs: passed to Regression
    Returns:
        fitted Regression object
    '''

    select = word_counts > 0
    if weights is not None:
        select *= weights > 0
        weights = weights[select]

    return Regression(**kwargs).fit(cooc_matrix[select].dot(word_embeddings) / word_counts[select, None], word_embeddings[select], weights)
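
# Illustrative sketch of the induction step (not part of the original module): learn the
# linear map from average context vectors back to existing word embeddings. The random
# matrices below are stand-ins for a real (V, V) cooccurrence matrix and (V, d) embeddings.
#
#   V, d = 100, 50
#   rng = np.random.RandomState(0)
#   coocs = sp.random(V, V, density=0.1, format='csr', random_state=rng) * 10
#   embeddings = rng.randn(V, d).astype(FLOAT)
#   counts = np.asarray(coocs.sum(1)).flatten()
#   reg = linear_transform(coocs, embeddings, counts)
#   # reg.predict maps averaged context embeddings of shape (n, d) to the original
#   # embedding space; reg.coef_ has shape (d, d)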
@@ -0,0 +1,142 @@
# computes (feature, word) cooccurrence matrices given text corpora
# to run use command:
# mpirun -n 8 python ALaCarte/cooc.py $FEATURE $CORPUSFILE $VOCABFILE $OUTPUTROOT $N
# where:
# FEATURE is one of ngram, synset, word
# CORPUSFILE is the corpus textfile (skipped if $FEATURE==synset)
# VOCABFILE is the word vocabulary
# OUTPUTROOT is the output name (script will make files $OUTPUTROOT.npz and $OUTPUTROOT.pkl)
# N is a positive integer (skipped if not $FEATURE==ngram)
# if $FEATURE==ngram: computes cooccurrences for all n-grams in sst,sst_fine,imdb,mr,cr,subj,mpqa,trec,mrpc,sick tasks
# if $FEATURE==synset: computes cooccurrences for all synsets in SemCor
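# example invocations (file names below are hypothetical placeholders):
#   mpirun -n 8 python ALaCarte/cooc.py ngram corpus.txt vocab.txt output/ngrams 2
#   mpirun -n 8 python ALaCarte/cooc.py word corpus.txt vocab.txt output/words
#   python ALaCarte/cooc.py synset vocab.txt output/synsets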

import pickle
import sys
from itertools import chain
import nltk
import numpy as np
from nltk.corpus import semcor
from scipy import sparse as sp
from ALaCarte.compute import *
from text_embedding.documents import *
from text_embedding.features import *


UNK = '<unk>'


def ngram_vocab(n):
    '''n-gram vocabulary over all evaluation tasks'''
    ngrams = lambda docs: {ngram for doc in tokenize(doc.lower() for doc in docs) for ngram in nltk.ngrams(doc, n)}
    vocabulary = set.union(*(ngrams(task()[0]) for task in TASKMAP['cross-validation'].values()))
    for task in TASKMAP['train-test split'].values():
        for partition in ['train', 'test']:
            try:
                vocabulary = vocabulary.union(ngrams(task(partition)[0]))
            except FileNotFoundError:
                pass
    return sorted(vocabulary)


def ntokens(tokens):
    # count tokens after joining the chunk and re-splitting on punctuation
    return len(list(split_on_punctuation(' '.join(tokens))))


def synset_context(sents):
    '''returns a doc2wnd-style function yielding (synset, context window) pairs over tagged SemCor sentences'''
    def context(strdoc, intdoc, vocabulary, wndo2=None, unkfeat=None):
        unk = unkfeat is not None
        wndo2 = len(intdoc) if wndo2 is None else wndo2
        offset = 0
        for chunk in next(sents):
            if type(chunk) == list:
                length = ntokens(chunk)
            else:
                label = chunk.label()
                if type(label) == str:
                    length = ntokens(chunk)
                else:
                    length = ntokens(chunk[0])
                    synset = label.synset()
                    if synset in vocabulary:
                        yield synset, chain(intdoc[max(offset-wndo2, 0):offset], intdoc[offset+length:offset+length+wndo2])
                    elif unk:
                        yield unkfeat, chain(intdoc[max(offset-wndo2, 0):offset], intdoc[offset+length:offset+length+wndo2])
            offset += length
    return context


def synset_vocab():
    return sorted({label.synset() for label in (chunk.label() for chunk in semcor.tagged_chunks(tag='sem') if not type(chunk) == list) if not type(label) == str})


def alacache(nameroot, feature='ngram'):
    '''loads the output of this script
    Args:
        nameroot: root of the output files (without extensions); the $OUTPUTROOT input argument
        feature: string name of the feature that was computed
    Returns:
        for a word x word cooccurrence run: cooc matrix, word vocab, word counts; otherwise also feature vocab and feature counts
    '''

    matrix = sp.load_npz(nameroot+'.npz')
    with open(nameroot+'.pkl', 'rb') as f:
        data = pickle.load(f)
    if len(data) == 2:
        return matrix, data['words'], data['counts']
    return matrix, data['words'], data['wordcounts'], data[feature+'s'], data[feature+'counts']
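
# Illustrative usage sketch (not part of the original module); 'output/ngrams' and
# 'output/words' are hypothetical $OUTPUTROOT values from earlier runs:
#
#   matrix, words, wordcounts, ngrams, ngramcounts = alacache('output/ngrams', feature='ngram')
#   matrix, words, counts = alacache('output/words')   # word x word run: the pickle has only two entries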


if __name__ == '__main__':

    feature = sys.argv[1]
    if feature == 'ngram':
        corpusfile, vocabfile, outputroot, n = sys.argv[2:6]
        n = int(n)
    elif feature == 'synset':
        vocabfile, outputroot = sys.argv[2:4]
    elif feature == 'word':
        feature = ''
        corpusfile, vocabfile, outputroot = sys.argv[2:5]
    else:
        raise NotImplementedError

    with open(vocabfile, 'r') as f:
        vocab = [line.rstrip('\n').split(' ')[0] for line in f]
    vocab.append(UNK)

    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
    except ImportError:
        comm = None

    if feature:

        if feature == 'ngram':
            featurevocab = ngram_vocab(n)
            with open(corpusfile, 'r') as f:
                matrix, featurecounts, wordcounts = cooc_matrix((line.split() for line in f), featurevocab, vocab, n=n, unk=UNK, verbose=True, comm=comm)
        elif feature == 'synset':
            featurevocab = synset_vocab()
            matrix, featurecounts, wordcounts = cooc_matrix(semcor.sents(), featurevocab, vocab, doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))), unk=UNK, interval=100, verbose=True, wndo2=None)
            featurevocab = [synset.name() for synset in featurevocab]
        else:
            raise NotImplementedError

        if comm is None or not comm.rank:
            sp.save_npz(outputroot+'.npz', matrix)
            with open(outputroot+'.pkl', 'wb') as f:
                pickle.dump({'words': vocab, feature+'s': featurevocab, 'wordcounts': wordcounts, feature+'counts': featurecounts}, f)
        else:
            sys.exit()

    else:

        with open(corpusfile, 'r') as f:
            matrix, counts = symmetric_cooc_matrix((line.split() for line in f), vocab, unk=UNK, verbose=True, comm=comm)
        if comm is None or not comm.rank:
            sp.save_npz(outputroot+'.npz', matrix)
            with open(outputroot+'.pkl', 'wb') as f:
                pickle.dump({'words': vocab, 'counts': counts}, f)
        else:
            sys.exit()
@@ -0,0 +1,4 @@
[Dolphin]
Timestamp=2014,4,23,16,37,56
Version=3
ViewMode=1