Skip to content

Commit

Permalink
Fixes aws#902 (aws#1632)
Browse files Browse the repository at this point in the history
* fix probability out of bound

* fixed probability out of bound

* cleared the notebook output

* fix of probabilities out of bound
  • Loading branch information
hongshanli23 authored Oct 20, 2020
1 parent 181f8d6 commit f783cfd
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
vocabulary_size = 25
image_dim = np.int(np.sqrt(vocabulary_size))

# to be used for numerical stability
epsilon = np.finfo(float).eps

# perform checks on input
assert num_topics in [5,10], 'Example data only available for 5 or 10 topics'
if alpha:
Expand All @@ -89,18 +92,23 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
# create the col topics. when num_topics = 10 add the row topics as well
#
beta = np.zeros((num_topics,image_dim,image_dim), dtype=np.float)


for i in range(image_dim):
beta[i,:,i] = dirichlet_eta.rvs(size=1)
if num_topics == 10:
for i in range(image_dim):
beta[i+image_dim,i,:] = dirichlet_eta.rvs(size=1)
beta.resize(num_topics, vocabulary_size)
# normalize beta to ensure each row is a valid probability dist
beta /= (1 + epsilon)

# generate documents using the LDA model / process
#
document_lengths = sp.stats.poisson(average_document_length).rvs(size=num_documents)
documents = np.zeros((num_documents,vocabulary_size), dtype=np.float)
thetas = dirichlet_alpha.rvs(size=num_documents) # precompute topic distributions for performance
thetas /=(1+epsilon)
for m in range(num_documents):
document_length = document_lengths[m]
theta = thetas[m]
Expand Down Expand Up @@ -193,3 +201,5 @@ def plot_lda_topics(documents, nrows, ncols, with_colorbar=True,
vmin=vmin, vmax=vmax)

return fig


Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
"""
vocabulary_size = 25
image_dim = np.int(np.sqrt(vocabulary_size))

# to be used for numerical stability
epsilon = np.finfo(float).eps

# perform checks on input
assert num_topics in [5,10], 'Example data only available for 5 or 10 topics'
Expand Down Expand Up @@ -98,9 +101,14 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,

# generate documents using the LDA model / process
#
# normalize beta to ensure each row is a valid probability dist
beta /= (1 + epsilon)

document_lengths = sp.stats.poisson(average_document_length).rvs(size=num_documents)
documents = np.zeros((num_documents,vocabulary_size), dtype=np.float)
thetas = dirichlet_alpha.rvs(size=num_documents) # precompute topic distributions for performance
thetas /= (1 + epsilon)

for m in range(num_documents):
document_length = document_lengths[m]
theta = thetas[m]
Expand Down

0 comments on commit f783cfd

Please sign in to comment.