Commit 68cb891 (1 parent: 1e68bb2)
Showing 26 changed files with 10,810 additions and 0 deletions.
@@ -0,0 +1,201 @@
from runServer import app
from flask import render_template, request
from bokeh.plotting import figure, output_file, show
from bokeh.embed import components
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import scispacy
import spacy
import en_core_sci_lg
from scipy.spatial.distance import cosine
import joblib
from IPython.display import HTML, display, clear_output
from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from tqdm import tqdm
from os.path import isfile
import seaborn as sb
import matplotlib.pyplot as plt
from joblib import dump, load

# Load the scispaCy large biomedical model; the tagger, parser and NER
# components are disabled because only tokenization and lemmatization are needed.
nlp = en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000

def spacy_tokenizer(sentence):
    # Lemmatize and drop numbers, stop words, punctuation, whitespace and single characters.
    return [word.lemma_ for word in nlp(sentence)
            if not (word.like_num or word.is_stop or word.is_punct or word.is_space or len(word) == 1)]

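# A minimal sketch of what the tokenizer produces (output is illustrative,
# assuming the en_core_sci_lg lemmatizer):
#
#   spacy_tokenizer("Viral replication was observed in 3 cell lines")
#   # -> ['viral', 'replication', 'observe', 'cell', 'line']
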
def print_top_words(model, vectorizer, n_top_words):
    # Print the n_top_words highest-weighted terms of each LDA topic.
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

def get_k_nearest_docs(doc_dist, k=5, lower=1950, upper=2020, only_covid19=False, get_dist=False):
    '''
    doc_dist: topic distribution (sums to 1) of one article.
    Returns the index of the k nearest articles, measured by cosine
    distance between topic distributions.
    '''
    relevant_time = df_covid.publish_year.between(lower, upper)

    if only_covid19:
        temp = doc_topic_dist[relevant_time & is_covid19_article]
    else:
        temp = doc_topic_dist[relevant_time]

    distances = temp.apply(lambda x: cosine(x, doc_dist), axis=1)
    k_nearest = distances[distances != 0].nsmallest(n=k).index

    if get_dist:
        k_distances = distances[distances != 0].nsmallest(n=k)
        return k_nearest, k_distances
    else:
        return k_nearest

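# A hedged usage sketch: look up the topic distribution of one paper and ask
# for its five nearest neighbours (the paper id is the one from the
# compare_dnas example below):
#
#   query = doc_topic_dist[df_covid.paper_id == '90b5ecf991032f3918ad43b252e17d1171b4ea63'].iloc[0]
#   neighbours, dists = get_k_nearest_docs(query, k=5, get_dist=True)
#   print(df_covid.iloc[neighbours].title)
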
def plot_article_dna(paper_id, width=20):
    # Bar plot of the topic distribution ("DNA") of a single paper.
    t = df_covid[df_covid.paper_id == paper_id].title.values[0]
    doc_topic_dist[df_covid.paper_id == paper_id].T.plot(kind='bar', legend=None, title=t, figsize=(width, 4))
    plt.xlabel('Topic')

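# Illustrative call (same paper id as the compare_dnas example below):
#
#   plot_article_dna('90b5ecf991032f3918ad43b252e17d1171b4ea63')
#   plt.show()
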
def compare_dnas(paper_id, recommendation_id, width=20):
    t = df_covid[df_covid.paper_id == recommendation_id].title.values[0]
    temp = doc_topic_dist[df_covid.paper_id == paper_id]
    ymax = temp.max(axis=1).values[0] * 1.25
    temp = pd.concat([temp, doc_topic_dist[df_covid.paper_id == recommendation_id]])
    temp.T.plot(kind='bar', title=t, figsize=(width, 4), ylim=[0, ymax])
    plt.xlabel('Topic')
    plt.legend(['Selection', 'Recommendation'])

# compare_dnas('90b5ecf991032f3918ad43b252e17d1171b4ea63', 'a137eb51461b4a4ed3980aa5b9cb2f2c1cf0292a')

def dna_tabs(paper_ids):
    # One tab per paper, each holding that paper's topic-distribution plot.
    k = len(paper_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children=outs)
    tab_titles = ['Paper ' + str(i + 1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            plot_article_dna(paper_ids[i])
            plt.show()

def compare_tabs(paper_id, recommendation_ids):
    # One tab per recommendation, each comparing it against the selected paper.
    k = len(recommendation_ids)
    outs = [widgets.Output() for i in range(k)]

    tab = widgets.Tab(children=outs)
    tab_titles = ['Paper ' + str(i + 1) for i in range(k)]
    for i, t in enumerate(tab_titles):
        tab.set_title(i, t)
    display(tab)

    for i, t in enumerate(tab_titles):
        with outs[i]:
            compare_dnas(paper_id, recommendation_ids[i])
            plt.show()

def recommendation(paper_id, k=5, lower=2000, upper=2020, only_covid19=False, plot_dna=False):
    '''
    Prints the titles of the k papers that are closest (topic-wise) to the paper given by paper_id.
    '''
    print(df_covid.title[df_covid.paper_id == paper_id].values[0])

    recommended, dist = get_k_nearest_docs(doc_topic_dist[df_covid.paper_id == paper_id].iloc[0], k, lower, upper, only_covid19, get_dist=True)
    recommended = df_covid.iloc[recommended].copy()
    recommended['similarity'] = 1 - dist

    # Notebook alternative: render the titles as HTML.
    # h = '\n'.join([n + '\n' + ' (Similarity: ' + "{:.2f}".format(s) + ')' for n, s in recommended[['title', 'similarity']].values])
    # display(HTML(h))
    print(recommended[['title', 'similarity']].values)

    if plot_dna:
        compare_tabs(paper_id, recommended.paper_id.values)

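# Illustrative call, reusing the paper id from the compare_dnas example above:
#
#   recommendation('90b5ecf991032f3918ad43b252e17d1171b4ea63', k=5, only_covid19=True)
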
def relevant_articles(tasks, k=3, lower=2000, upper=2020, only_covid19=False):
    # Accept a single query string or a list of query strings.
    tasks = [tasks] if type(tasks) is str else tasks

    # Project the queries into topic space with the fitted vectorizer and LDA model.
    tasks_vectorized = vectorizer.transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda.transform(tasks_vectorized))

    for index, bullet in enumerate(tasks):
        print(bullet)
        recommended, dist = get_k_nearest_docs(tasks_topic_dist.iloc[index], k, lower, upper, only_covid19, get_dist=True)
        # Copy so the similarity column is not written into a view of df_covid.
        recommended = df_covid.iloc[recommended].copy()
        recommended['similarity'] = 1 - dist

        print(recommended)
        # Notebook alternative: render linked titles as HTML.
        # h = '<br/>'.join(['<a href="' + l + '" target="_blank">' + n + '</a>' for l, n in recommended[['url', 'title']].values])
        # display(HTML(h))

    return recommended

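# Illustrative query against the fitted topic model:
#
#   results = relevant_articles('coronavirus transmission in healthcare settings', k=3)
#   print(results[['title', 'similarity']])
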

# Load the preprocessed dataframe and the fitted models. Despite the .csv
# extensions, all but doc_topic_dist.csv are joblib artifacts, loaded with joblib's load.
df_covid = load('df_covid.csv')
vectorizer = load('vectorizer.csv')
data_vectorized = load('data_vectorized.csv')
lda = load('lda.csv')
doc_topic_dist = pd.read_csv('doc_topic_dist.csv')
is_covid19_article = df_covid.body_text.str.contains('COVID-19|SARS-CoV-2|2019-nCov|SARS Coronavirus 2|2019 Novel Coronavirus')

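# For reference, a sketch of how these artifacts could have been produced; this
# is an assumption, the fitting code is not part of this commit:
#
#   vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, max_features=2 ** 12)
#   data_vectorized = vectorizer.fit_transform(df_covid.body_text)
#   lda = LatentDirichletAllocation(n_components=50, random_state=0).fit(data_vectorized)
#   dump(vectorizer, 'vectorizer.csv')
#   dump(lda, 'lda.csv')
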

@app.route('/')
def hello_world():
    return render_template("home.html")

@app.route('/predict', methods=['POST', 'GET'])
def predict():
    # The search box is the only form field, so take the first submitted value.
    searchText = [str(x) for x in request.form.values()][0]
    print(searchText)

    results = relevant_articles(searchText)
    title = []
    sim = []
    content = []
    url = []
    for i, row in results.iterrows():
        # Strip the <br> tags that loader.get_breaks inserted for plotting.
        abstract = str(row['abstract']).replace("<br>", " ")
        content.append(abstract)
        sim.append(row['similarity'])
        title_ = str(row['title']).replace("<br>", " ")
        title.append(title_)
        url.append(row['url'])
    return render_template("predict.html", len=len(title), title=title, content=content, sim=sim, url=url, searchText=searchText)
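# A hedged example of exercising the endpoint once the server is running
# (the field name 'search' is an assumption; the handler only reads the first
# form value, so any single field works):
#
#   import requests
#   r = requests.post('http://127.0.0.1:5000/predict', data={'search': 'viral shedding duration'})
#   print(r.status_code)
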
if __name__ == "__main__":
    app.run(host='127.0.0.1', debug=True)
@@ -0,0 +1,144 @@
import numpy as np
import pandas as pd
import glob
import json
import gc
import matplotlib.pyplot as plt

plt.style.use('ggplot')
pd.set_option('display.max_columns', 30)

root_path = '/home/mohamed/Desktop/Codes/Covid-Data'
metadata_path = f'{root_path}/metadata.csv'
# Read the CORD-19 metadata; keep the id columns as strings.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})

# Collect every JSON full-text file below the data root.
all_json = glob.glob(f'{root_path}/**/**/*.json', recursive=True)

class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        self.abstract = []
        self.body_text = []
        # Abstract (some papers provide none).
        for entry in content.get('abstract', []):
            self.abstract.append(entry['text'])
        # Body text
        for entry in content['body_text']:
            self.body_text.append(entry['text'])
        self.abstract = '\n'.join(self.abstract)
        self.body_text = '\n'.join(self.body_text)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

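# Illustrative use on the first JSON file found (output truncated by __repr__):
#
#   first_row = FileReader(all_json[0])
#   print(first_row)
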
class loader:
    def __init__(self):
        pass

    @staticmethod
    def get_breaks(content, length):
        # Insert an HTML <br> roughly every `length` characters, on word
        # boundaries, so long strings wrap in plots and web pages.
        data = ""
        words = content.split(' ')
        total_chars = 0

        for i in range(len(words)):
            total_chars += len(words[i])
            if total_chars > length:
                data = data + "<br>" + words[i]
                total_chars = 0
            else:
                data = data + " " + words[i]
        return data

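    # Example of the wrapping behaviour (break positions depend on word lengths):
    #
    #   loader.get_breaks('severe acute respiratory syndrome coronavirus', 20)
    #   # -> ' severe acute<br>respiratory syndrome coronavirus'
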
    def load(self, start=0, end=1000):
        dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'publish_year': [], 'url': [], 'WHO #Covidence': [], 'license': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
        for idx, entry in enumerate(all_json[start:end]):
            # Progress message roughly every 1% of the corpus.
            if idx % max(1, len(all_json) // 100) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')
            content = FileReader(entry)
            # Get the metadata row for this paper; skip papers without one.
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            if len(meta_data) == 0:
                continue

            dict_['paper_id'].append(content.paper_id)
            dict_['abstract'].append(content.abstract)
            dict_['body_text'].append(content.body_text)
            dict_['publish_year'].append(pd.DatetimeIndex(meta_data['publish_time']).year.values[0])
            dict_['url'].append(meta_data['url'].values[0])
            dict_['WHO #Covidence'].append(meta_data['WHO #Covidence'].values[0])
            dict_['license'].append(meta_data['license'].values[0])
            # Also create a column with a summary of the abstract, to be used in a plot.
            if len(content.abstract) == 0:
                # No abstract provided.
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # Abstract is too long for the plot: take the first 100 words and append "...".
                info = content.abstract.split(' ')[:100]
                summary = self.get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # Abstract is short enough.
                summary = self.get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            try:
                # If more than one author.
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    # More than 2 authors may be a problem when plotting, so take the first 2 and append "...".
                    dict_['authors'].append(". ".join(authors[:2]) + "...")
                else:
                    # Authors will fit in the plot.
                    dict_['authors'].append(". ".join(authors))
            except Exception as e:
                # Only one author, or a null value.
                dict_['authors'].append(meta_data['authors'].values[0])

            # Add the title information, inserting breaks where needed.
            try:
                title = self.get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            except Exception as e:
                # Title was not provided.
                dict_['title'].append(meta_data['title'].values[0])

            # Add the journal information.
            dict_['journal'].append(meta_data['journal'].values[0])

        df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'publish_year', 'url', 'license', 'WHO #Covidence', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])

        return df_covid
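
# Illustrative end-to-end use of the loader (paths as configured above):
#
#   df_covid = loader().load(start=0, end=1000)
#   print(df_covid.shape)
#   df_covid.head()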