
Commit: Add files via upload

Youssef-Matloob authored May 28, 2020
1 parent 1e68bb2 commit 68cb891
Showing 26 changed files with 10,810 additions and 0 deletions.
Binary file added __pycache__/runServer.cpython-36.pyc
Binary file added __pycache__/runServer.cpython-37.pyc
201 changes: 201 additions & 0 deletions app.py
@@ -0,0 +1,201 @@
from runServer import app
from flask import render_template, request
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
from scipy.spatial.distance import cosine
from joblib import load
import en_core_sci_lg

nlp = en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000



def spacy_tokenizer(sentence):
    return [word.lemma_ for word in nlp(sentence)
            if not (word.like_num or word.is_stop or word.is_punct
                    or word.is_space or len(word) == 1)]
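
# Illustrative output (the exact lemmas depend on the scispacy model's vocabulary):
#   spacy_tokenizer("Viral replication in 2 host cells.")
#   -> ['viral', 'replication', 'host', 'cell']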

def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

def get_k_nearest_docs(doc_dist, k=5, lower=1950, upper=2020, only_covid19=False, get_dist=False):
    '''
    doc_dist: topic distribution (sums to 1) of one article.
    Returns the index of the k nearest articles (by cosine distance in topic space).
    '''
    relevant_time = df_covid.publish_year.between(lower, upper)
    if only_covid19:
        temp = doc_topic_dist[relevant_time & is_covid19_article]
    else:
        temp = doc_topic_dist[relevant_time]

    distances = temp.apply(lambda x: cosine(x, doc_dist), axis=1)
    k_nearest = distances[distances != 0].nsmallest(n=k).index

    if get_dist:
        k_distances = distances[distances != 0].nsmallest(n=k)
        return k_nearest, k_distances
    else:
        return k_nearest

def plot_article_dna(paper_id, width=20):
    t = df_covid[df_covid.paper_id == paper_id].title.values[0]
    doc_topic_dist[df_covid.paper_id == paper_id].T.plot(kind='bar', legend=None,
                                                         title=t, figsize=(width, 4))
    plt.xlabel('Topic')

def compare_dnas(paper_id, recommendation_id, width=20):
    t = df_covid[df_covid.paper_id == recommendation_id].title.values[0]
    temp = doc_topic_dist[df_covid.paper_id == paper_id]
    ymax = temp.max(axis=1).values[0] * 1.25
    temp = pd.concat([temp, doc_topic_dist[df_covid.paper_id == recommendation_id]])
    temp.T.plot(kind='bar', title=t, figsize=(width, 4), ylim=[0, ymax])
    plt.xlabel('Topic')
    plt.legend(['Selection', 'Recommendation'])

# compare_dnas('90b5ecf991032f3918ad43b252e17d1171b4ea63', 'a137eb51461b4a4ed3980aa5b9cb2f2c1cf0292a')

def dna_tabs(paper_ids):
    k = len(paper_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children=outs)
    tab_titles = ['Paper ' + str(i + 1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            plot_article_dna(paper_ids[i])
            plt.show()

def compare_tabs(paper_id, recommendation_ids):
    k = len(recommendation_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children=outs)
    tab_titles = ['Paper ' + str(i + 1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            compare_dnas(paper_id, recommendation_ids[i])
            plt.show()

def recommendation(paper_id, k=5, lower=2000, upper=2020, only_covid19=False, plot_dna=False):
    '''
    Prints the titles of the k papers that are closest (topic-wise) to the paper given by paper_id.
    '''
    print(df_covid.title[df_covid.paper_id == paper_id].values[0])

    recommended, dist = get_k_nearest_docs(
        doc_topic_dist[df_covid.paper_id == paper_id].iloc[0],
        k, lower, upper, only_covid19, get_dist=True)
    recommended = df_covid.iloc[recommended].copy()
    recommended['similarity'] = 1 - dist

    print(recommended[['title', 'similarity']].values)

    if plot_dna:
        compare_tabs(paper_id, recommended.paper_id.values)
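
# Illustrative call, reusing the paper id from the compare_dnas example above:
# recommendation('90b5ecf991032f3918ad43b252e17d1171b4ea63', k=5, plot_dna=True)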


def relevant_articles(tasks, k=3, lower=2000, upper=2020, only_covid19=False):
    tasks = [tasks] if type(tasks) is str else tasks

    tasks_vectorized = vectorizer.transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda.transform(tasks_vectorized))

    for index, bullet in enumerate(tasks):
        print(bullet)
        recommended, dist = get_k_nearest_docs(tasks_topic_dist.iloc[index], k,
                                               lower, upper, only_covid19, get_dist=True)
        recommended = df_covid.iloc[recommended].copy()
        recommended['similarity'] = 1 - dist
        print(recommended)

    # note: with several tasks, only the last result set is returned
    return recommended
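
# Illustrative query (any free-text task description works):
# relevant_articles('What is known about the incubation period of the virus?', k=3)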



# The *.csv names below are joblib pickles produced offline, not plain CSV files;
# only doc_topic_dist.csv is a real CSV.
df_covid = load('df_covid.csv')
vectorizer = load('vectorizer.csv')
data_vectorized = load('data_vectorized.csv')
lda = load('lda.csv')
doc_topic_dist = pd.read_csv('doc_topic_dist.csv')
is_covid19_article = df_covid.body_text.str.contains(
    'COVID-19|SARS-CoV-2|2019-nCoV|SARS Coronavirus 2|2019 Novel Coronavirus')



@app.route('/')
def hello_world():
    return render_template("home.html")

@app.route('/predict', methods=['POST', 'GET'])
def predict():
    # the form is expected to contain a single search-text field
    searchText = [str(x) for x in request.form.values()][0]
    print(searchText)
    results = relevant_articles(searchText)
    title = []
    sim = []
    content = []
    url = []
    for i, row in results.iterrows():
        abstract = str(row['abstract']).replace("<br>", " ")
        content.append(abstract)
        sim.append(row['similarity'])
        title_ = str(row['title']).replace("<br>", " ")
        title.append(title_)
        url.append(row['url'])
    return render_template("predict.html", len=len(title), title=title, content=content,
                           sim=sim, url=url, searchText=searchText)

if __name__ == "__main__":
    app.run(host='127.0.0.1', debug=True)
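
A minimal client sketch for exercising the /predict endpoint above (illustrative: the field name 'searchText' is an assumption based on the template; the handler simply reads the first form field):

    import requests

    resp = requests.post('http://127.0.0.1:5000/predict',
                         data={'searchText': 'coronavirus incubation period'})
    print(resp.status_code)  # 200 on success; the body is the rendered predict.html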
144 changes: 144 additions & 0 deletions loader.py
@@ -0,0 +1,144 @@
import glob
import json

import pandas as pd
import matplotlib.pyplot as plt

plt.style.use('ggplot')
pd.set_option('display.max_columns', 30)

root_path = '/home/mohamed/Desktop/Codes/Covid-Data'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})

all_json = glob.glob(f'{root_path}/**/**/*.json', recursive=True)

class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        self.abstract = []
        self.body_text = []
        # Abstract (some papers have none)
        try:
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
        except KeyError:
            pass
        # Body text
        for entry in content['body_text']:
            self.body_text.append(entry['text'])
        self.abstract = '\n'.join(self.abstract)
        self.body_text = '\n'.join(self.body_text)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

# e.g. first_row = FileReader(all_json[0])

class loader:
    @staticmethod
    def get_breaks(content, length):
        data = ""
        words = content.split(' ')
        total_chars = 0

        # insert an HTML break roughly every `length` characters
        for i in range(len(words)):
            total_chars += len(words[i])
            if total_chars > length:
                data = data + "<br>" + words[i]
                total_chars = 0
            else:
                data = data + " " + words[i]
        return data
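
    # e.g. get_breaks("one two three four", 8) returns ' one two<br>three four'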

    def load(self, start=0, end=1000):
        dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'publish_year': [],
                 'url': [], 'WHO #Covidence': [], 'license': [], 'authors': [],
                 'title': [], 'journal': [], 'abstract_summary': []}
        for idx, entry in enumerate(all_json[start:end]):
            if idx % max(len(all_json) // 100, 1) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')
            content = FileReader(entry)

            # get metadata information; no metadata means we skip this paper
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            if len(meta_data) == 0:
                continue

            dict_['paper_id'].append(content.paper_id)
            dict_['abstract'].append(content.abstract)
            dict_['body_text'].append(content.body_text)
            dict_['publish_year'].append(pd.DatetimeIndex(meta_data['publish_time']).year.values[0])
            dict_['url'].append(meta_data['url'].values[0])
            dict_['WHO #Covidence'].append(meta_data['WHO #Covidence'].values[0])
            dict_['license'].append(meta_data['license'].values[0])

            # also create a column for the summary of the abstract to be used in a plot
            if len(content.abstract) == 0:
                # no abstract provided
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # abstract too long for a plot: take the first 100 words and append "..."
                info = content.abstract.split(' ')[:100]
                summary = self.get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # abstract is short enough
                summary = self.get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            try:
                # more than 2 authors may be a problem when plotting,
                # so take the first 2 and append "..."
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    dict_['authors'].append(". ".join(authors[:2]) + "...")
                else:
                    # authors will fit in the plot
                    dict_['authors'].append(". ".join(authors))
            except Exception:
                # only one author, or a null value
                dict_['authors'].append(meta_data['authors'].values[0])

            # add the title information, adding breaks where needed
            try:
                title = self.get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            except Exception:
                # title was not provided
                dict_['title'].append(meta_data['title'].values[0])

            # add the journal information
            dict_['journal'].append(meta_data['journal'].values[0])

        df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'publish_year', 'url',
                                                'license', 'WHO #Covidence', 'body_text',
                                                'authors', 'title', 'journal', 'abstract_summary'])
        return df_covid

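For reference, a sketch of how the artifacts that app.py deserializes could be produced with this loader (illustrative: the vectorizer and LDA settings are placeholders, and the tokenizer would be one like spacy_tokenizer in app.py; none of this is recorded in the commit):

    import pandas as pd
    from joblib import dump
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    from loader import loader

    df_covid = loader().load(start=0, end=1000)

    vectorizer = CountVectorizer(max_features=2 ** 12)           # placeholder settings
    data_vectorized = vectorizer.fit_transform(df_covid.body_text)
    lda = LatentDirichletAllocation(n_components=50, random_state=0)
    doc_topic_dist = pd.DataFrame(lda.fit_transform(data_vectorized))

    # app.py loads the first four with joblib despite the .csv extension
    dump(df_covid, 'df_covid.csv')
    dump(vectorizer, 'vectorizer.csv')
    dump(data_vectorized, 'data_vectorized.csv')
    dump(lda, 'lda.csv')
    doc_topic_dist.to_csv('doc_topic_dist.csv', index=False)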