# May 17, 2018
# 1 parent b971925 commit 7e341ab
import sys
from collections import Counter
from collections import defaultdict
from itertools import chain
import nltk
import numpy as np
from scipy import sparse as sp
from sklearn.linear_model import LinearRegression as LR

# dtype of cooccurrence matrices built with averaging or weighting (fractional entries)
FLOAT = np.float32
# dtype of raw cooccurrence counts and of the count vectors returned by cooc_matrix
INT = np.int32

def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
    '''sliding window around n-grams in a document
    Args:
        strdoc: list of tokens (as strings)
        intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
        vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
        n: n in n-gram
        wndo2: half the window size
        unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
    Returns:
        generator over (n-gram, context window) pairs; each n-gram is a tuple of n tokens and each
        context window is an iterator over the indices of up to wndo2 tokens on either side of it
        (the tokens of the n-gram itself are excluded)
    '''

    keep_unknown = unkgram is not None
    # n-grams as tuples of n consecutive tokens; zip stops when the shortest shifted copy runs out,
    # so documents with fewer than n tokens yield nothing
    ngrams = zip(*(strdoc[shift:] for shift in range(n)))
    for i, ngram in enumerate(ngrams):
        if ngram in vocabulary:
            feature = ngram
        elif keep_unknown:
            feature = unkgram
        else:
            continue
        # left edge is clamped at 0 because a negative slice start would wrap around to the end
        yield feature, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+n+wndo2])

def counts2mat(featcoocs, featlist, shape, dtype):
    '''computes matrix from feature-word cooccurrence counts
    Args:
        featcoocs: dict mapping features to Counters (column index -> count)
        featlist: list of features; row i of the output corresponds to featlist[i]
        shape: matrix shape
        dtype: dtype of matrix
    Returns:
        sparse matrix in CSR format
    '''

    rows, cols, values = [], [], []
    for i, feat in enumerate(featlist):
        # .get avoids inserting an empty Counter per feature when featcoocs is a defaultdict
        coocs = featcoocs.get(feat)
        if not coocs:
            continue
        for j, count in coocs.items():
            # negative column indices mark out-of-vocabulary words (cooc_matrix uses -1 when
            # unk is None) and must not reach coo_matrix, which rejects them
            if j < 0:
                continue
            rows.append(i)
            cols.append(j)
            values.append(count)

    # nothing to place: the original zip(*...) unpacking raised ValueError on this case
    if not values:
        return sp.csr_matrix(shape, dtype=dtype)
    return sp.coo_matrix((values, (rows, cols)), shape=shape, dtype=dtype).tocsr()

def cooc_matrix(corpus, featlist, wordlist, doc2wnd=ngram_context, unk=None, overlap=False, avg=False, wei=False, interval=1000000, verbose=False, comm=None, **kwargs):
    '''constructs feature, word cooccurrence matrix
    Args:
        corpus: iterable of lists of strings
        featlist: list of hashable features
        wordlist: list of strings
        doc2wnd: takes list of tokens, list of indices, and set of features and returns a (feature, index iterable) generator
        unk: map words not in wordlist to this token (must be in wordlist); if None excludes OOV words
        overlap: if True subtracts feature count from cooccurrence of feature with any word it contains; features must be iterable
        avg: uses average over window size rather than cooccurrence counts
        wei: weight co-occurring words by distance from window
        interval: number of documents between conversion to sparse matrix
        verbose: write context matrix construction progress
        comm: MPI Communicator; outputs are None for non-root processes
        kwargs: passed to doc2wnd
    Returns:
        cooccurrence matrix in CSR format, vector of feature counts, vector of word counts
    '''

    assert not (overlap and (avg or wei)), "correcting for overlap not compatible with averaging or weighting"

    featset = set(featlist)
    featcounts = Counter()
    F = len(featlist)
    # OOV words get index -1 when no unk token is given
    unki = -1 if unk is None else wordlist.index(unk)
    word2index = {word: i for i, word in enumerate(wordlist)}
    wordcounts = Counter()
    V = len(wordlist)

    rank, size = (0, 1) if comm is None else (comm.rank, comm.size)
    dtype = FLOAT if (avg or wei) else INT
    if not rank:
        # only the root process holds the accumulated matrix
        matrix = sp.csr_matrix((F, V), dtype=dtype)
    featcoocs = defaultdict(Counter)
    ndocs = 0

    def report(current, end=''):
        # progress line; '\r' rewrites the same terminal line on every call
        sys.stdout.write('\rProcessed '+str(ndocs)+' Documents; Sparsity: '+str(current.nnz)+'/'+str(F*V)+'; Coverage: '+str((current.sum(1)>0).sum())+'/'+str(F)+end)
        sys.stdout.flush()

    for i, doc in enumerate(corpus):
        ndocs = i+1
        # documents are dealt out round-robin across processes
        if i%size == rank:
            indices = [word2index.get(word, unki) for word in doc]
            # word counts are keyed by word index, matching the lookup in the return statement
            wordcounts.update(indices)
            if avg:
                for feat, window in doc2wnd(doc, indices, featset, **kwargs):
                    window = list(window)
                    if window:
                        # each occurrence contributes a total weight of 1 spread over its window
                        increment = 1.0/len(window)
                        cooccounts = featcoocs[feat]
                        for index in window:
                            cooccounts[index] += increment
                        # NOTE(review): nesting reconstructed; occurrences with an empty window are not counted
                        featcounts[feat] += 1
            elif wei:
                for feat, window in doc2wnd(doc, indices, featset, **kwargs):
                    window = list(window)
                    if window:
                        length = len(window)
                        half = int(length/2)
                        recip = 1.0/length
                        cooccounts = featcoocs[feat]
                        # weights grow towards the middle of the window list; presumably the first
                        # half is the left context and the second half the right context
                        for j, index in enumerate(window[:half]):
                            cooccounts[index] += recip/(half-j)
                        for j, index in enumerate(window[half:]):
                            cooccounts[index] += recip/(j+1)
                        # NOTE(review): nesting reconstructed; occurrences with an empty window are not counted
                        featcounts[feat] += 1
            else:
                for feat, window in doc2wnd(doc, indices, featset, **kwargs):
                    # plain cooccurrence counts
                    featcoocs[feat].update(window)
                    featcounts[feat] += 1

        # every process reaches this point for every document, so sends and receives pair up
        if not ndocs%interval:
            batch = counts2mat(featcoocs, featlist, (F, V), dtype)
            if rank:
                comm.send(batch, dest=0)
            else:
                matrix = matrix + sum((comm.recv(source=j) for j in range(1, size)), batch)
                if verbose:
                    report(matrix)
            # counts were folded into a sparse matrix; start a fresh batch to bound memory
            featcoocs = defaultdict(Counter)

    if size > 1:
        featcounts = comm.reduce(featcounts, root=0)
        wordcounts = comm.reduce(wordcounts, root=0)
    batch = counts2mat(featcoocs, featlist, (F, V), dtype)
    if rank:
        comm.send(batch, dest=0)
        return 3*[None]
    matrix = matrix + sum((comm.recv(source=j) for j in range(1, size)), batch)

    if overlap:
        # applied to the final matrix with the total feature counts; correcting the per-batch
        # Counters after they were already converted would leave the result unchanged
        rows, cols, values = [], [], []
        for f, feat in enumerate(featlist):
            count = featcounts[feat]
            if not count:
                continue
            for word in feat:
                index = word2index.get(word)
                if index is not None:
                    rows.append(f)
                    cols.append(index)
                    values.append(count)
        if values:
            # duplicate (row, col) entries are summed, so a word repeated in a feature is subtracted once per repeat
            matrix = matrix - sp.coo_matrix((values, (rows, cols)), shape=(F, V), dtype=dtype).tocsr()

    if verbose:
        report(matrix, end='\n')
    return matrix, np.array([featcounts[feat] for feat in featlist], dtype=INT), np.array([wordcounts[word2index[word]] for word in wordlist], dtype=INT)

def symmetric_cooc_matrix(corpus, wordlist, unk=None, **kwargs):
    '''constructs symmetric word, word cooccurrence matrix
    Args:
        corpus: iterable of lists of strings
        wordlist: list of strings
        unk: map words not in wordlist to this token (must be in wordlist); if None excludes OOV words
        kwargs: passed to cooc_matrix
    Returns:
        cooccurrence matrix in CSR format, vector of word counts (the counts of the unigram features)
    '''

    # every word becomes a 1-gram feature, so rows and columns follow the same word order
    features = [(word,) for word in wordlist]
    unkgram = None if unk is None else (unk,)
    # cooc_matrix returns (matrix, feature counts, word counts); only the first two are kept
    return cooc_matrix(corpus, features, wordlist, unk=unk, n=1, unkgram=unkgram, **kwargs)[:2]

def linear_transform(cooc_matrix, word_embeddings, word_counts, Regression=LR, weights=None, **kwargs):
    '''learns linear transform from context vectors to original embeddings
    Args:
        cooc_matrix: cooccurrence matrix of size (V, V)
        word_embeddings: embedding matrix of size (V, d)
        word_counts: word count vector of length V
        Regression: regression class (from sklearn.linear_model)
        weights: sample weight vector of length V; ignored if None
        kwargs: passed to Regression
    Returns:
        fitted Regression object
    '''

    # words that never occur have no context vector (and would divide by zero below)
    select = word_counts > 0
    if weights is not None:
        # also drop words whose sample weight is not positive
        select = select & (weights > 0)
        weights = weights[select]

    # context vector of a word: sum of the embeddings of its co-occurring words, divided by its count
    context_vectors = cooc_matrix[select].dot(word_embeddings) / word_counts[select, None]
    # weights stays positional, as in the original call, so any Regression.fit(X, y, w) works
    return Regression(**kwargs).fit(context_vectors, word_embeddings[select], weights)
# computes (feature, word) cooccurrence matrices given text corpora
# to run use command:
#   python <this script> $FEATURE $CORPUSFILE $VOCABFILE $OUTPUTROOT $N
# where:
#   FEATURE is one of ngram, synset, word
#   CORPUSFILE is the corpus textfile (skipped if $FEATURE==synset)
#   VOCABFILE is the word vocabulary
#   OUTPUTROOT is the output name (script will make files $OUTPUTROOT.npz and $OUTPUTROOT.pkl)
#   N is a positive integer (skipped if not $FEATURE==ngram)
# if $FEATURE==ngram: computes cooccurrences for all n-grams in sst,sst_fine,imdb,mr,cr,subj,mpqa,trec,mrpc,sick tasks
# if $FEATURE==synset: computes cooccurrences for all synsets in SemCor
# if $FEATURE==word: computes the symmetric word-word cooccurrence matrix of the corpus
import pickle
import sys
from itertools import chain
import nltk
import numpy as np
from nltk.corpus import semcor
from scipy import sparse as sp
from ALaCarte.compute import *
from text_embedding.documents import *
from text_embedding.features import *

# token passed as `unk` to the cooccurrence routines by the script at the bottom of this file
UNK = '<unk>'

def ngram_vocab(n):
    '''collects the n-grams occurring in the documents of all tasks in TASKMAP
    Args:
        n: n in n-gram
    Returns:
        sorted list of n-grams (tuples of n lowercased tokens)
    '''

    def ngrams(docs):
        # set of n-grams over the tokenized, lowercased documents
        return {ngram for doc in tokenize(doc.lower() for doc in docs) for ngram in nltk.ngrams(doc, n)}

    # set().union(...) rather than set.union(...) so an empty task map gives an empty vocabulary
    vocabulary = set().union(*(ngrams(task()[0]) for task in TASKMAP['cross-validation'].values()))
    for task in TASKMAP['train-test split'].values():
        for partition in ['train', 'test']:
            try:
                vocabulary |= ngrams(task(partition)[0])
            except FileNotFoundError:
                # NOTE(review): handler body reconstructed; a task whose data file is missing is skipped
                continue
    return sorted(vocabulary)

def ntokens(tokens):
    '''number of tokens obtained by re-splitting a sequence of tokens on punctuation
    Args:
        tokens: iterable of strings
    Returns:
        int
    '''

    # count the pieces themselves: wrapping the result in a one-element list made the length always 1;
    # summing works whether split_on_punctuation returns a list or a generator
    return sum(1 for _ in split_on_punctuation(' '.join(tokens)))

def synset_context(sents):
    '''builds a doc2wnd function (see cooc_matrix) yielding context windows around sense-tagged chunks
    Args:
        sents: iterator over chunked sentences; one sentence is consumed per call of the returned
               function, so it must be called once per document, in corpus order
    Returns:
        function taking (strdoc, intdoc, vocabulary, wndo2=None, unkfeat=None) and returning a
        generator over (synset, context window) pairs
    '''

    def context(strdoc, intdoc, vocabulary, wndo2=None, unkfeat=None):
        # strdoc is unused; the chunks come from the sentence iterator instead
        unk = unkfeat is not None
        # default window covers the whole document
        wndo2 = len(intdoc) if wndo2 is None else wndo2
        offset = 0
        for chunk in next(sents):
            # exact type checks are deliberate: presumably tagged chunks are nltk Trees, which
            # subclass list, so isinstance would not tell them apart from plain token lists
            if type(chunk) == list:
                length = ntokens(chunk)
            else:
                label = chunk.label()
                if type(label) == str:
                    length = ntokens(chunk)
                else:
                    # NOTE(review): branch structure reconstructed; the label carries a synset here
                    length = ntokens(chunk[0])
                    synset = label.synset()
                    # clamp at 0: a negative slice start wraps around and loses the left context
                    left = intdoc[max(offset-wndo2, 0):offset]
                    right = intdoc[offset+length:offset+length+wndo2]
                    if synset in vocabulary:
                        yield synset, chain(left, right)
                    elif unk:
                        yield unkfeat, chain(left, right)
            offset += length

    return context

def synset_vocab():
    '''collects the synsets of all sense-tagged chunks in SemCor
    Returns:
        sorted list of synsets
    '''

    synsets = set()
    for chunk in semcor.tagged_chunks(tag='sem'):
        # plain lists are untagged token runs
        if type(chunk) == list:
            continue
        label = chunk.label()
        # string labels carry no synset
        if type(label) == str:
            continue
        synsets.add(label.synset())
    return sorted(synsets)

def alacache(nameroot, feature='ngram'):
    '''function to return output of this script
    Args:
        nameroot: root of files (without extensions); the input argument 'outputroot'
        feature: string name of feature that was computed
    Returns:
        if file is for word x word cooccurrence: returns cooc matrix, word vocab, word counts;
        otherwise also returns feature vocab and feature counts
    '''

    matrix = sp.load_npz(nameroot+'.npz')
    # NOTE: unpickling can execute arbitrary code; only load files written by this script
    with open(nameroot+'.pkl', 'rb') as f:
        data = pickle.load(f)
    # the word x word output stores exactly two entries: 'words' and 'counts'
    if len(data) == 2:
        return matrix, data['words'], data['counts']
    return matrix, data['words'], data['wordcounts'], data[feature+'s'], data[feature+'counts']

if __name__ == '__main__':

    # positional arguments depend on the feature type (see the usage comment at the top of the script)
    feature = sys.argv[1]
    if feature == 'ngram':
        corpusfile, vocabfile, outputroot, n = sys.argv[2:6]
        n = int(n)
    elif feature == 'synset':
        vocabfile, outputroot = sys.argv[2:4]
    elif feature == 'word':
        # empty feature name selects the symmetric word x word branch below
        feature = ''
        corpusfile, vocabfile, outputroot = sys.argv[2:5]
    else:
        # previously fell through and failed later with a NameError on vocabfile
        sys.exit("FEATURE must be one of ngram, synset, word; got '"+feature+"'")

    # first space-separated field of each line is the word; the newline is stripped first so
    # that lines holding only a word do not keep a trailing '\n'
    with open(vocabfile, 'r') as f:
        vocab = [line.rstrip('\n').split(' ')[0] for line in f]

    # MPI is optional: without mpi4py everything runs in a single process
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
    except ImportError:
        comm = None

    if feature:

        if feature == 'ngram':
            featurevocab = ngram_vocab(n)
            with open(corpusfile, 'r') as f:
                matrix, featurecounts, wordcounts = cooc_matrix((line.split() for line in f), featurevocab, vocab, n=n, unk=UNK, verbose=True, comm=comm)
        elif feature == 'synset':
            featurevocab = synset_vocab()
            # comm is not passed: synset_context consumes one tagged sentence per document, so
            # every document has to be processed by the same process
            matrix, featurecounts, wordcounts = cooc_matrix(semcor.sents(), featurevocab, vocab, doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))), unk=UNK, interval=100, verbose=True, wndo2=None)
            # NOTE(review): expression reconstructed; synsets are presumably stored by name in the pickle
            featurevocab = [synset.name() for synset in featurevocab]

        # only the root process (or the single process) holds the results
        if comm is None or not comm.rank:
            sp.save_npz(outputroot+'.npz', matrix)
            with open(outputroot+'.pkl', 'wb') as f:
                pickle.dump({'words': vocab, feature+'s': featurevocab, 'wordcounts': wordcounts, feature+'counts': featurecounts}, f)

    else:

        with open(corpusfile, 'r') as f:
            matrix, counts = symmetric_cooc_matrix((line.split() for line in f), vocab, unk=UNK, verbose=True, comm=comm)
        if comm is None or not comm.rank:
            sp.save_npz(outputroot+'.npz', matrix)
            with open(outputroot+'.pkl', 'wb') as f:
                pickle.dump({'words': vocab, 'counts': counts}, f)
