From b6de4b23186c2bc0142d074711d3196482441f87 Mon Sep 17 00:00:00 2001
From: Louis Chartrand
Date: Thu, 18 Mar 2021 01:46:50 -0400
Subject: [PATCH] found gpu settings for faiss

---
 clctm.py | 64 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/clctm.py b/clctm.py
index 5b83671..edbf562 100644
--- a/clctm.py
+++ b/clctm.py
@@ -3,6 +3,7 @@
 import numpy as np
 from scipy.spatial.distance import cdist
 from scipy.special import softmax
+from scipy.stats import rankdata
 import tqdm.auto as tqdm
 import datetime as dt
 from operator import sub as substract
@@ -327,16 +328,35 @@ def _init_concept_vectors(self, corpus, sample_size=0.01, method="kmeans++", met
         assert len(corpus.input_ids) == len(corpus.token_vectors)
 
         if method == "kmeans++":
-            #NB: Not using FAISS because it's actually much slower than cdist ?!! even with gpu
-            # step 1
-            self.concept_vectors = [samp[choicefn(sampsize)]]
-            distances = cdist(self.concept_vectors, samp, metric=metric)
+            if faiss_avail:
+                cfg = faiss.GpuIndexFlatConfig()
+                cfg.device = 0
 
-            for i in tqdm.trange(1, self.n_concepts, desc="Kmeans++ initialization"):
-                #step 2 & 3 - note that random.multinomial is 3x faster than random.choice
-                self.concept_vectors = np.concatenate((self.concept_vectors, [samp[np.random.multinomial(1, pvals=softmax(distances.min(0)**2)).argmax()]]))
-                distances = np.concatenate((distances, cdist([self.concept_vectors[-1]], samp, metric=metric)))
+                # step 1: seed the GPU index with one randomly chosen sample vector
+                self.concept_index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), self.n_dims, cfg)
+                cvs = [np.random.choice(sampsize)]
+                self.concept_index.add(samp[cvs[0]:cvs[0]+1])
+
+                for i in tqdm.trange(1, self.n_concepts, desc="Kmeans++ initialization (with faiss)"):
+                    #step 2: squared distance from each sample to its closest centroid so far
+                    D, _ = self.concept_index.search(samp, 1)
+
+                    #step 3: draw the next centroid proportionally to that distance
+                    cvs.append(np.random.choice(sampsize, p=D.T[0]/D.sum()))
+                    self.concept_index.add(samp[cvs[-1]:cvs[-1]+1])
+
+                self.concept_vectors = samp[cvs]
+
+            else:
+                # step 1
+                self.concept_vectors = [samp[choicefn(sampsize)]]
+                distances = cdist(self.concept_vectors, samp, metric=metric)
+
+                for i in tqdm.trange(1, self.n_concepts, desc="Kmeans++ initialization"):
+                    #step 2 & 3 - note that random.multinomial is 3x faster than random.choice
+                    self.concept_vectors = np.concatenate((self.concept_vectors, [samp[np.random.multinomial(1, pvals=softmax(distances.min(0)**2)).argmax()]]))
+                    distances = np.concatenate((distances, cdist([self.concept_vectors[-1]], samp, metric=metric)))
 
         else:
             self.concept_vectors = np.random.choice(samp, size=self.n_concepts)
 
@@ -364,9 +384,7 @@ def kv2array(keys, values, size=None, dtype=None):
         #NB: faiss is actually much faster (x18!) than cdist here
         t0 = dt.datetime.now()
         if faiss_avail:
-            cidx = faiss.IndexFlatL2(self.n_dims)
-            cidx.add(self.concept_vectors)
-            _, c = cidx.search(corpus.token_vectors, 1)
+            _, c = self.concept_index.search(corpus.token_vectors, 1)
             self.concepts = c.T[0]
         else:
             self.concepts = cdist(corpus.token_vectors, self.concept_vectors).argmin(axis=1)
@@ -475,24 +493,38 @@ def softmax(v):
             # .maxCoeff() effectively seems to be .max()
             return r/r.sum()
 
-    def sample_c(w, wvec, z):
+    def sample_c(w, i, wvec, z):
         # NB: orig implementation reduced time here by creating "neighbor lists" for each
         # token. Obviously, CWEs change even when tokens are the same, so we can't do that
         # here.
         # TODO: check if it's still faster to pick closest concepts for each word token
-        # TODO: Make sure not using softmax is ok.
         # TODO: (Maybe) check if derivation checks out? Pretty weird to me.
-        t1 = -0.5 * self.n_dims * np.log(self.sigma_c)
-        t2 = -(0.5 / self.sigma_c) * (self.mu_c_dot_mu_c - 2 * self.mu_c @ wvec)
+        nbindices = self.token_neighbors[i]  # token i's precomputed nearest concepts
+        t1 = -0.5 * self.n_dims * np.log(self.sigma_c[nbindices])
+        t2 = -(0.5 / self.sigma_c[nbindices]) * (self.mu_c_dot_mu_c[nbindices] - 2 * self.mu_c[nbindices] @ wvec)
 
-        prob = softmax(np.log(self.n_zc[z] + self.beta) + t1 + t2)
+        prob = softmax(np.log(self.n_zc[z, nbindices] + self.beta) + t1 + t2)
 
-        return np.random.choice(list(range(self.n_concepts)), p=prob)
+        return np.random.choice(nbindices, p=prob)  # prob only covers the neighbor concepts
 
+    def create_neighbor_list():
+        if faiss_avail:
+            pass  # TODO: unfinished stub; neighbor lists are rebuilt in the loop below
+
     pbg = tqdm.tqdm(total=self.n_iter, desc="Iterations")
     pbdoc = tqdm.tqdm(total=self.n_docs, desc="Documents")
 
     for it in range(self.n_iter):
+        if it % 5 == 0:
+            if faiss_avail:
+                _, self.token_neighbors = self.concept_index.search(corpus.token_vectors, self.nneighbors)
+            else:
+                # indices of the self.nneighbors closest concepts for each token
+                self.token_neighbors = np.argsort(
+                    cdist(corpus.token_vectors, self.concept_vectors),
+                    axis=1
+                )[:, :self.nneighbors]
+
         num_z_changed = 0
         num_c_changed = 0
         num_omit = 0
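
For reference, a minimal standalone sketch of the FAISS pattern this patch relies on, assuming a faiss-gpu install; the array names and sizes below are illustrative, not taken from clctm.py. It builds a flat L2 index on GPU 0 when one is available, falls back to the CPU index otherwise, and queries the k nearest centroids per token. Flat indexes expect contiguous float32 input, and IndexFlatL2 / GpuIndexFlatL2 return *squared* L2 distances, which is why the faiss branch of the kmeans++ init can sample proportionally to D without squaring it again.

    import numpy as np
    import faiss  # faiss-gpu build assumed

    n_dims, n_concepts, n_tokens, k = 768, 100, 10_000, 10
    centroids = np.random.rand(n_concepts, n_dims).astype("float32")
    tokens = np.random.rand(n_tokens, n_dims).astype("float32")

    if faiss.get_num_gpus() > 0:
        cfg = faiss.GpuIndexFlatConfig()
        cfg.device = 0  # pin the index to GPU 0, as in the patch
        index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), n_dims, cfg)
    else:
        index = faiss.IndexFlatL2(n_dims)  # CPU fallback

    index.add(centroids)            # index the concept vectors
    D, I = index.search(tokens, k)  # D: squared L2 distances, I: neighbor indices
    nearest_concept = I[:, 0]       # first column is each token's closest concept
    # I is the (n_tokens, k) neighbor list the sampler refreshes every 5 iterations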