Commit 68cb891 (1 parent: 1e68bb2)
Showing 26 changed files with 10,810 additions and 0 deletions.
@@ -0,0 +1,201 @@
from runServer import app
from flask import render_template, request
from bokeh.plotting import figure, output_file, show
from bokeh.embed import components
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import scispacy
import spacy
import en_core_sci_lg
from scipy.spatial.distance import cosine
import joblib
from IPython.display import HTML, display, clear_output
from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from tqdm import tqdm
from os.path import isfile
import seaborn as sb
import matplotlib.pyplot as plt
from joblib import dump, load

# Load the scispaCy large biomedical model; the tagger, parser and NER
# components are disabled because only tokenization and lemmatization are needed.
nlp = en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000

def spacy_tokenizer(sentence):
    # Lemmatize and drop numbers, stop words, punctuation, whitespace and single characters.
    return [word.lemma_ for word in nlp(sentence)
            if not (word.like_num or word.is_stop or word.is_punct or word.is_space or len(word) == 1)]

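# A minimal sketch of what the tokenizer produces (output is illustrative,
# assuming the en_core_sci_lg lemmatizer):
#
#   spacy_tokenizer("Viral replication was observed in 3 cell lines")
#   # -> ['viral', 'replication', 'observe', 'cell', 'line']
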
def print_top_words(model, vectorizer, n_top_words):
    # Print the n_top_words highest-weighted terms of each LDA topic.
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

def get_k_nearest_docs(doc_dist, k=5, lower=1950, upper=2020, only_covid19=False, get_dist=False):
    '''
    doc_dist: topic distribution (sums to 1) of one article.
    Returns the index of the k nearest articles, measured by cosine
    distance between topic distributions.
    '''
    relevant_time = df_covid.publish_year.between(lower, upper)

    if only_covid19:
        temp = doc_topic_dist[relevant_time & is_covid19_article]
    else:
        temp = doc_topic_dist[relevant_time]

    distances = temp.apply(lambda x: cosine(x, doc_dist), axis=1)
    k_nearest = distances[distances != 0].nsmallest(n=k).index

    if get_dist:
        k_distances = distances[distances != 0].nsmallest(n=k)
        return k_nearest, k_distances
    else:
        return k_nearest

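# A hedged usage sketch: look up the topic distribution of one paper and ask
# for its five nearest neighbours (the paper id is the one from the
# compare_dnas example below):
#
#   query = doc_topic_dist[df_covid.paper_id == '90b5ecf991032f3918ad43b252e17d1171b4ea63'].iloc[0]
#   neighbours, dists = get_k_nearest_docs(query, k=5, get_dist=True)
#   print(df_covid.iloc[neighbours].title)
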
def plot_article_dna(paper_id, width=20):
    # Bar plot of the topic distribution ("DNA") of a single paper.
    t = df_covid[df_covid.paper_id == paper_id].title.values[0]
    doc_topic_dist[df_covid.paper_id == paper_id].T.plot(kind='bar', legend=None, title=t, figsize=(width, 4))
    plt.xlabel('Topic')

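# Illustrative call (same paper id as the compare_dnas example below):
#
#   plot_article_dna('90b5ecf991032f3918ad43b252e17d1171b4ea63')
#   plt.show()
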
def compare_dnas(paper_id, recommendation_id, width=20):
    t = df_covid[df_covid.paper_id == recommendation_id].title.values[0]
    temp = doc_topic_dist[df_covid.paper_id == paper_id]
    ymax = temp.max(axis=1).values[0] * 1.25
    temp = pd.concat([temp, doc_topic_dist[df_covid.paper_id == recommendation_id]])
    temp.T.plot(kind='bar', title=t, figsize=(width, 4), ylim=[0, ymax])
    plt.xlabel('Topic')
    plt.legend(['Selection', 'Recommendation'])

# compare_dnas('90b5ecf991032f3918ad43b252e17d1171b4ea63', 'a137eb51461b4a4ed3980aa5b9cb2f2c1cf0292a')

def dna_tabs(paper_ids):
    # One tab per paper, each holding that paper's topic-distribution plot.
    k = len(paper_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children=outs)
    tab_titles = ['Paper ' + str(i + 1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            plot_article_dna(paper_ids[i])
            plt.show()

def compare_tabs(paper_id, recommendation_ids):
    # One tab per recommendation, each comparing it against the selected paper.
    k = len(recommendation_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children=outs)
    tab_titles = ['Paper ' + str(i + 1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            compare_dnas(paper_id, recommendation_ids[i])
            plt.show()

def recommendation(paper_id, k=5, lower=2000, upper=2020, only_covid19=False, plot_dna=False):
    '''
    Prints the titles of the k papers that are closest (topic-wise) to the paper given by paper_id.
    '''
    print(df_covid.title[df_covid.paper_id == paper_id].values[0])

    recommended, dist = get_k_nearest_docs(doc_topic_dist[df_covid.paper_id == paper_id].iloc[0], k, lower, upper, only_covid19, get_dist=True)
    recommended = df_covid.iloc[recommended].copy()
    recommended['similarity'] = 1 - dist

    # Notebook alternative: render the titles as HTML.
    # h = '\n'.join([n + '\n' + ' (Similarity: ' + "{:.2f}".format(s) + ')' for n, s in recommended[['title', 'similarity']].values])
    # display(HTML(h))
    print(recommended[['title', 'similarity']].values)

    if plot_dna:
        compare_tabs(paper_id, recommended.paper_id.values)

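# Illustrative call, reusing the paper id from the compare_dnas example above:
#
#   recommendation('90b5ecf991032f3918ad43b252e17d1171b4ea63', k=5, only_covid19=True)
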
def relevant_articles(tasks, k=3, lower=2000, upper=2020, only_covid19=False):
    # Accept a single query string or a list of query strings.
    tasks = [tasks] if type(tasks) is str else tasks

    # Project the queries into topic space with the fitted vectorizer and LDA model.
    tasks_vectorized = vectorizer.transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda.transform(tasks_vectorized))

    for index, bullet in enumerate(tasks):
        print(bullet)
        recommended, dist = get_k_nearest_docs(tasks_topic_dist.iloc[index], k, lower, upper, only_covid19, get_dist=True)
        # Copy so the similarity column is not written into a view of df_covid.
        recommended = df_covid.iloc[recommended].copy()
        recommended['similarity'] = 1 - dist

        print(recommended)
        # Notebook alternative: render linked titles as HTML.
        # h = '<br/>'.join(['<a href="' + l + '" target="_blank">' + n + '</a>' for l, n in recommended[['url', 'title']].values])
        # display(HTML(h))

    return recommended

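# Illustrative query against the fitted topic model:
#
#   results = relevant_articles('coronavirus transmission in healthcare settings', k=3)
#   print(results[['title', 'similarity']])
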

# Load the preprocessed dataframe and the fitted models. Despite the .csv
# extensions, all but doc_topic_dist.csv are joblib artifacts, loaded with joblib's load.
df_covid = load('df_covid.csv')
vectorizer = load('vectorizer.csv')
data_vectorized = load('data_vectorized.csv')
lda = load('lda.csv')
doc_topic_dist = pd.read_csv('doc_topic_dist.csv')
is_covid19_article = df_covid.body_text.str.contains('COVID-19|SARS-CoV-2|2019-nCov|SARS Coronavirus 2|2019 Novel Coronavirus')

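# For reference, a sketch of how these artifacts could have been produced; this
# is an assumption, the fitting code is not part of this commit:
#
#   vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, max_features=2 ** 12)
#   data_vectorized = vectorizer.fit_transform(df_covid.body_text)
#   lda = LatentDirichletAllocation(n_components=50, random_state=0).fit(data_vectorized)
#   dump(vectorizer, 'vectorizer.csv')
#   dump(lda, 'lda.csv')
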

@app.route('/')
def hello_world():
    return render_template("home.html")

@app.route('/predict', methods=['POST', 'GET'])
def predict():
    # The search box is the only form field, so take the first submitted value.
    searchText = [str(x) for x in request.form.values()][0]
    print(searchText)

    results = relevant_articles(searchText)
    title = []
    sim = []
    content = []
    url = []
    for i, row in results.iterrows():
        # Strip the <br> tags that loader.get_breaks inserted for plotting.
        abstract = str(row['abstract']).replace("<br>", " ")
        content.append(abstract)
        sim.append(row['similarity'])
        title_ = str(row['title']).replace("<br>", " ")
        title.append(title_)
        url.append(row['url'])
    return render_template("predict.html", len=len(title), title=title, content=content, sim=sim, url=url, searchText=searchText)
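# A hedged example of exercising the endpoint once the server is running
# (the field name 'search' is an assumption; the handler only reads the first
# form value, so any single field works):
#
#   import requests
#   r = requests.post('http://127.0.0.1:5000/predict', data={'search': 'viral shedding duration'})
#   print(r.status_code)
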
if __name__ == "__main__":
    app.run(host='127.0.0.1', debug=True)
@@ -0,0 +1,144 @@
import numpy as np
import pandas as pd
import glob
import json
import gc
import matplotlib.pyplot as plt

plt.style.use('ggplot')
pd.set_option('display.max_columns', 30)

root_path = '/home/mohamed/Desktop/Codes/Covid-Data'
metadata_path = f'{root_path}/metadata.csv'
# Read the CORD-19 metadata; keep the id columns as strings.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})

# Collect every JSON full-text file below the data root.
all_json = glob.glob(f'{root_path}/**/**/*.json', recursive=True)

class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        self.abstract = []
        self.body_text = []
        # Abstract (some papers provide none).
        for entry in content.get('abstract', []):
            self.abstract.append(entry['text'])
        # Body text
        for entry in content['body_text']:
            self.body_text.append(entry['text'])
        self.abstract = '\n'.join(self.abstract)
        self.body_text = '\n'.join(self.body_text)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

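# Illustrative use on the first JSON file found (output truncated by __repr__):
#
#   first_row = FileReader(all_json[0])
#   print(first_row)
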
class loader:
    def __init__(self):
        pass

    @staticmethod
    def get_breaks(content, length):
        # Insert an HTML <br> roughly every `length` characters, on word
        # boundaries, so long strings wrap in plots and web pages.
        data = ""
        words = content.split(' ')
        total_chars = 0

        for i in range(len(words)):
            total_chars += len(words[i])
            if total_chars > length:
                data = data + "<br>" + words[i]
                total_chars = 0
            else:
                data = data + " " + words[i]
        return data

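    # Example of the wrapping behaviour (break positions depend on word lengths):
    #
    #   loader.get_breaks('severe acute respiratory syndrome coronavirus', 20)
    #   # -> ' severe acute<br>respiratory syndrome coronavirus'
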
    def load(self, start=0, end=1000):
        dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'publish_year': [], 'url': [], 'WHO #Covidence': [], 'license': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
        for idx, entry in enumerate(all_json[start:end]):
            # Progress message roughly every 1% of the corpus.
            if idx % max(1, len(all_json) // 100) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')
            content = FileReader(entry)
            # Get the metadata row for this paper; skip papers without one.
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            if len(meta_data) == 0:
                continue

            dict_['paper_id'].append(content.paper_id)
            dict_['abstract'].append(content.abstract)
            dict_['body_text'].append(content.body_text)
            dict_['publish_year'].append(pd.DatetimeIndex(meta_data['publish_time']).year.values[0])
            dict_['url'].append(meta_data['url'].values[0])
            dict_['WHO #Covidence'].append(meta_data['WHO #Covidence'].values[0])
            dict_['license'].append(meta_data['license'].values[0])
            # Also create a column with a summary of the abstract, to be used in a plot.
            if len(content.abstract) == 0:
                # No abstract provided.
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # Abstract is too long for the plot: take the first 100 words and append "...".
                info = content.abstract.split(' ')[:100]
                summary = self.get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # Abstract is short enough.
                summary = self.get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            try:
                # If more than one author.
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    # More than 2 authors may be a problem when plotting, so take the first 2 and append "...".
                    dict_['authors'].append(". ".join(authors[:2]) + "...")
                else:
                    # Authors will fit in the plot.
                    dict_['authors'].append(". ".join(authors))
            except Exception as e:
                # Only one author, or a null value.
                dict_['authors'].append(meta_data['authors'].values[0])

            # Add the title information, inserting breaks where needed.
            try:
                title = self.get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            except Exception as e:
                # Title was not provided.
                dict_['title'].append(meta_data['title'].values[0])

            # Add the journal information.
            dict_['journal'].append(meta_data['journal'].values[0])

        df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'publish_year', 'url', 'license', 'WHO #Covidence', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])

        return df_covid
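
# Illustrative end-to-end use of the loader (paths as configured above):
#
#   df_covid = loader().load(start=0, end=1000)
#   print(df_covid.shape)
#   df_covid.head()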