adding topic indexing
ddangelov committed Nov 2, 2023
1 parent fd2079b commit 42a764b
Showing 1 changed file with 172 additions and 50 deletions.
222 changes: 172 additions & 50 deletions top2vec/Top2Vec.py
@@ -362,7 +362,7 @@ class Top2Vec:
Tokenizer must take a document and return a list of tokens.
use_embedding_model_tokenizer: bool (Optional, default False)
use_embedding_model_tokenizer: bool (Optional, default True)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding. If set to True the tokenizer, either
default or passed callable will be used to tokenize the text to
@@ -372,13 +372,24 @@ class Top2Vec:
Pass custom arguments to UMAP.
gpu_umap: bool (default False)
If True umap will use the rapidsai cuml library to perform the
dimensionality reduction. This will lead to a significant speedup
in the computation time. To install rapidsai cuml follow the
instructions here: https://docs.rapids.ai/install
If True umap will use the rapidsai cuml library to perform the
dimensionality reduction. This will lead to a significant speedup
in the computation time during model creation. To install rapidsai
cuml follow the instructions here: https://docs.rapids.ai/install
hdbscan_args: dict (Optional, default None)
Pass custom arguments to HDBSCAN.
gpu_hdbscan: bool (default False)
If True hdbscan will use the rapidsai cuml library to perform the
clustering. This will lead to a significant speedup in the computation
time during model creation. To install rapidsai cuml follow the
instructions here: https://docs.rapids.ai/install
index_topics: bool (Optional, default False)
If True, the topic vectors will be indexed using hnswlib. This will
significantly speed up finding topics during model creation for
very large datasets.
verbose: bool (Optional, default True)
Whether to print status data during training.
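For context, a minimal sketch of how the new flags fit together at model creation. The corpus mirrors the project's README example; everything else here is illustrative, and index_topics requires hnswlib (pip install top2vec[indexing]):

    from sklearn.datasets import fetch_20newsgroups
    from top2vec import Top2Vec

    # Placeholder corpus, as in the Top2Vec README.
    newsgroups = fetch_20newsgroups(subset='all',
                                    remove=('headers', 'footers', 'quotes'))

    model = Top2Vec(documents=newsgroups.data,
                    gpu_umap=False,     # True requires rapidsai cuml
                    gpu_hdbscan=False,  # True requires rapidsai cuml
                    index_topics=True)  # index topic vectors with hnswlib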
@@ -406,10 +417,12 @@ def __init__(self,
keep_documents=True,
workers=None,
tokenizer=None,
use_embedding_model_tokenizer=False,
use_embedding_model_tokenizer=True,
umap_args=None,
gpu_umap=False,
hdbscan_args=None,
gpu_hdbscan=False,
index_topics=False,
verbose=True
):

@@ -680,7 +693,9 @@ def return_doc(doc):
self.compute_topics(umap_args=umap_args,
hdbscan_args=hdbscan_args,
topic_merge_delta=topic_merge_delta,
gpu_umap=gpu_umap)
gpu_umap=gpu_umap,
gpu_hdbscan=gpu_hdbscan,
index_topics=index_topics)

# initialize document indexing variables
self.document_index = None
@@ -694,6 +709,11 @@ def return_doc(doc):
self.serialized_word_index = None
self.words_indexed = False

# initialize topic indexing variables
self.topic_index = None
self.serialized_topic_index = None
self.topics_indexed = False

def save(self, file):
"""
Saves the current model to the specified file.
@@ -706,6 +726,7 @@ def save(self, file):

document_index_temp = None
word_index_temp = None
topic_index_temp = None

# do not save sentence encoders, sentence transformers and custom embedding
if self.embedding_model not in ["doc2vec"]:
Expand All @@ -729,10 +750,20 @@ def save(self, file):
word_index_temp = self.word_index
self.word_index = None

# serialize topic index so that it can be saved
if self.topics_indexed:
temp = tempfile.NamedTemporaryFile(mode='w+b')
self.topic_index.save_index(temp.name)
self.serialized_topic_index = temp.read()
temp.close()
topic_index_temp = self.topic_index
self.topic_index = None

dump(self, file)

self.document_index = document_index_temp
self.word_index = word_index_temp
self.topic_index = topic_index_temp

@classmethod
def load(cls, file):
@@ -781,6 +812,23 @@ def load(cls, file):
temp.close()
top2vec_model.serialized_word_index = None

# load topic index
if top2vec_model.topics_indexed:

if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load word index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")

temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_topic_index)
temp.flush()  # ensure the bytes are on disk before hnswlib reads the path
topic_vectors = top2vec_model.topic_vectors
top2vec_model.topic_index = hnswlib.Index(space='ip',
dim=topic_vectors.shape[1])
top2vec_model.topic_index.load_index(temp.name, max_elements=topic_vectors.shape[0])
temp.close()
top2vec_model.serialized_topic_index = None

return top2vec_model

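The save/load changes above follow the same pattern as the existing document and word indexes: an hnswlib index is not directly picklable, so it is round-tripped through a named temporary file as raw bytes. A standalone sketch of that pattern, using a toy index over random vectors (all shapes and parameters are illustrative):

    import tempfile

    import hnswlib
    import numpy as np

    # Toy inner-product index standing in for the topic index.
    vectors = np.random.rand(25, 300).astype(np.float32)
    index = hnswlib.Index(space='ip', dim=300)
    index.init_index(max_elements=25, ef_construction=200, M=64)
    index.add_items(vectors, list(range(25)))

    # Serialize: hnswlib only writes to a path, so use a temp file.
    with tempfile.NamedTemporaryFile(mode='w+b') as temp:
        index.save_index(temp.name)
        serialized = temp.read()

    # Deserialize: write the bytes back to a path and load from it.
    with tempfile.NamedTemporaryFile(mode='w+b') as temp:
        temp.write(serialized)
        temp.flush()  # ensure the bytes are on disk before loading
        restored = hnswlib.Index(space='ip', dim=300)
        restored.load_index(temp.name, max_elements=25)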
@staticmethod
@@ -884,55 +932,68 @@ def _reorder_topics(self, hierarchy=False):
self.topic_sizes.reset_index(drop=True, inplace=True)

@staticmethod
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True, num_topics=None):
batch_size = 10000
doc_top = []
if dist:
def _calculate_documents_topic(topic_vectors,
document_vectors,
dist=True,
num_topics=None,
topic_index=None):

if topic_index is not None:
doc_top = []
doc_dist = []

if document_vectors.shape[0] > batch_size:
current = 0
batches = int(document_vectors.shape[0] / batch_size)
extra = document_vectors.shape[0] % batch_size

for ind in range(0, batches):
res = np.inner(document_vectors[current:current + batch_size], topic_vectors)

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

current += batch_size

if extra > 0:
res = np.inner(document_vectors[current:current + extra], topic_vectors)
for vector in document_vectors:
ids, distances = topic_index.knn_query(vector, k=1)
doc_top.append(ids[0][0])
doc_dist.append(1 - distances[0][0])  # hnswlib 'ip' distance is 1 - inner product
else:
batch_size = 10000
doc_top = []
if dist:
doc_dist = []

if document_vectors.shape[0] > batch_size:
current = 0
batches = int(document_vectors.shape[0] / batch_size)
extra = document_vectors.shape[0] % batch_size

for ind in range(0, batches):
res = np.inner(document_vectors[current:current + batch_size], topic_vectors)

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

current += batch_size

if extra > 0:
res = np.inner(document_vectors[current:current + extra], topic_vectors)

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if dist:
doc_dist = np.array(doc_dist)
else:
res = np.inner(document_vectors, topic_vectors)

if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
doc_top = np.argmax(res, axis=1)
if dist:
doc_dist.extend(np.max(res, axis=1))
doc_dist = np.max(res, axis=1)
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if dist:
doc_dist = np.array(doc_dist)
else:
res = np.inner(document_vectors, topic_vectors)

if num_topics is None:
doc_top = np.argmax(res, axis=1)
if dist:
doc_dist = np.max(res, axis=1)
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])

if num_topics is not None:
doc_top = np.array(doc_top)
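The two branches above trade exactness for speed: the dense path scores every document against every topic vector with np.inner (in 10,000-document batches), while the indexed path asks hnswlib for the single nearest topic. A toy sketch of the equivalence on random vectors; note that in hnswlib's 'ip' space the reported distance is 1 - inner product, which is why the patch converts with 1 - distance, and that knn_query also accepts a whole matrix, so the per-document loop could be batched:

    import hnswlib
    import numpy as np

    rng = np.random.default_rng(0)
    topic_vectors = rng.random((50, 300)).astype(np.float32)
    document_vectors = rng.random((10000, 300)).astype(np.float32)

    # Exact path: highest inner product per document.
    res = np.inner(document_vectors, topic_vectors)
    exact_top = np.argmax(res, axis=1)
    exact_dist = np.max(res, axis=1)

    # Approximate path: 1-nearest topic from an hnswlib 'ip' index.
    index = hnswlib.Index(space='ip', dim=300)
    index.init_index(max_elements=50, ef_construction=200, M=64)
    index.add_items(topic_vectors, list(range(50)))

    ids, distances = index.knn_query(document_vectors, k=1)
    approx_top = ids[:, 0]
    approx_dist = 1 - distances[:, 0]  # convert distance back to similarity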
@@ -1235,7 +1296,13 @@ def _validate_vector(self, vector):
if not vector.shape[0] == vec_size:
raise ValueError(f"Vector needs to be of {vec_size} dimensions.")

def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.1, gpu_umap=False):
def compute_topics(self,
umap_args=None,
hdbscan_args=None,
topic_merge_delta=0.1,
gpu_umap=False,
gpu_hdbscan=False,
index_topics=False):
"""
Computes topics from current document vectors.
@@ -1266,6 +1333,17 @@ def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.
dimensionality reduction. This will lead to a significant speedup
in the computation time. To install rapidsai cuml follow the
instructions here: https://docs.rapids.ai/install
gpu_hdbscan: bool (default False)
If True hdbscan will use the rapidsai cuml library to perform the
clustering. This will lead to a significant speedup
in the computation time. To install rapidsai cuml follow the
instructions here: https://docs.rapids.ai/install
index_topics: bool (default False)
If True the topic vectors will be indexed using hnswlib. This will
lead to faster search times for models with a large number of
topics.
"""

# create 5D embeddings of documents
@@ -1305,9 +1383,16 @@ def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.
# find topic words and scores
self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)

if index_topics:
self.index_topic_vectors()
topic_index = self.topic_index
else:
topic_index = None

# assign documents to topic
self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
self.document_vectors)
self.document_vectors,
topic_index=topic_index)

# calculate topic sizes
self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
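For an already trained model, the same options can be applied by recomputing topics. A usage sketch, where model stands for any existing Top2Vec instance:

    # Recompute topics and index the topic vectors, so document-to-topic
    # assignment goes through the hnswlib index.
    model.compute_topics(umap_args=None,
                         hdbscan_args=None,
                         topic_merge_delta=0.1,
                         gpu_umap=False,
                         gpu_hdbscan=False,
                         index_topics=True)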
@@ -1402,6 +1487,43 @@ def index_word_vectors(self, ef_construction=200, M=64):
self.word_index.add_items(word_vectors, index_ids)
self.words_indexed = True

def index_topic_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the topic vectors using hnswlib. This will
lead to faster search times for models with a large number of
topics.
For more information on hnswlib see: https://github.com/nmslib/hnswlib
Parameters
----------
ef_construction: int (Optional, default 200)
This parameter controls the trade-off between index construction
time and index accuracy. Larger values will lead to greater
accuracy but will take longer to construct.
M: int (Optional, default 64)
This parameter controls the trade-off between both index size as
well as construction time and accuracy. Larger values will lead to
greater accuracy but will result in a larger index as well as
longer construction time.
For more information on the parameters see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
"""
self._check_hnswlib_status()

topic_vectors = self.topic_vectors
vec_dim = topic_vectors.shape[1]
num_vecs = topic_vectors.shape[0]

index_ids = list(range(0, num_vecs))

self.topic_index = hnswlib.Index(space='ip', dim=vec_dim)
self.topic_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
self.topic_index.add_items(topic_vectors, index_ids)
self.topics_indexed = True

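The index can also be built after model creation. Larger ef_construction and M buy accuracy at the cost of construction time and index size; the values below are illustrative, not recommendations:

    # Build (or rebuild) the topic index with a more accurate construction.
    model.index_topic_vectors(ef_construction=400, M=64)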
def set_embedding_model(self, embedding_model):
"""
Set the embedding model. This is called after loading a saved Top2Vec