# File: scraper.py
# Description: This script defines functions for scraping and processing scientific papers from bioRxiv,
# extracting text and embeddings, and storing the information in a custom database.
# It also performs a keyword search on the obtained data.
# Importing necessary libraries
import os
import argparse
import datetime

import pandas as pd
import PyPDF2
from paperscraper.pdf import save_pdf
from paperscraper.get_dumps import biorxiv
from paperscraper.xrxiv.xrxiv_query import XRXivQuery
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

from VectorDatabase import Lantern, Fragment, Publication
# OpenAI setup: langchain's OpenAIEmbeddings reads the API key from the
# OPENAI_API_KEY environment variable, so it must be set before running this script.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError("OPENAI_API_KEY environment variable is not set")
"""
Scrapes papers from bioRxiv between the specified dates and saves the metadata in a JSON file.
:param start: Start date for the scraping (format: "YYYY-MM-DD").
:param end: End date for the scraping (format: "YYYY-MM-DD").
:param out_file: Output file to save the metadata in JSON Lines format.
:return: None
"""
def scrapeBiorxiv(start, end, out_file):
filepath = out_file
biorxiv(begin_date=start, end_date=end, save_path=out_file)
retreiveTextFromPdf(filepath)
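
# Example (hypothetical dates and output path): scrape one week of metadata
# and process the downloaded PDFs.
#   scrapeBiorxiv("2024-01-01", "2024-01-07", "bio.jsonl")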
"""
Retrieves text embeddings from a given text file using OpenAI's language model.
:param fname: Path to the input text file.
:return: A tuple containing text embeddings and the OpenAIEmbeddings instance.
"""
def get_embeddings(fname):
loader = TextLoader(fname)
documents = loader.load()
text_splitter = CharacterTextSplitter(
separator=".", chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
emb = OpenAIEmbeddings()
input_texts = [d.page_content for d in docs]
input_embeddings = emb.embed_documents(input_texts)
text_embeddings = list(zip(input_texts, input_embeddings))
return text_embeddings, emb
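
# Note: the FAISS import above is not used in the pipeline below. A minimal
# sketch of how the (text, embedding) pairs returned here could also back an
# in-memory vector store, assuming LangChain's FAISS.from_embeddings API and a
# hypothetical input file:
#
#   text_embeddings, emb = get_embeddings("scraped_txts/example.txt")
#   store = FAISS.from_embeddings(text_embeddings, emb)
#   hits = store.similarity_search("membrane protein structure", k=3)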
"""
Retrieves text from PDF files, extracts embeddings, and stores information in a custom database.
:param inp_file: Path to the input JSON file containing paper metadata.
:return: None
"""
def retreiveTextFromPdf(inp_file):
json = pd.read_json(path_or_buf=inp_file, lines=True)
lantern = Lantern()
for n, doi in enumerate(json['doi']):
paper_data = {'doi': doi}
doi = doi.replace("/", "-")
if lantern.publicationExists(doi):
continue
pdf_dir = './papers/'
if not os.path.exists(pdf_dir):
os.mkdir(pdf_dir)
pdfsavefile = './papers/' + doi + '.pdf'
save_pdf(paper_data, filepath=pdfsavefile)
# creating a pdf reader object
reader = PyPDF2.PdfReader(pdfsavefile)
save_txt_path = 'scrapped_txts/'
if not os.path.exists(save_txt_path):
os.mkdir(save_txt_path)
extract_text = ''
for page in reader.pages:
extract_text += page.extract_text()
txt_file = str('{}.txt'.format(doi))
with open(save_txt_path + txt_file, 'w') as file:
file.write(extract_text)
txt_embs, emb = get_embeddings(save_txt_path + txt_file)
fragments = []
for txt, embs in txt_embs:
fragment = Fragment(doi, 'methods', txt, embs)
fragments.append(fragment)
title = ""
pmc = ""
pubmed = ""
publication = Publication(doi, title, pmc, pubmed, doi)
lantern.insertEmbeddings(fragments)
lantern.insertPublication(publication)
os.remove(pdfsavefile)
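
# For reference, each line of the input metadata dump is parsed as one JSON
# record and only its 'doi' field is used above; a (hypothetical) record might
# look like:
#   {"doi": "10.1101/2024.01.01.000001", "title": "...", "abstract": "..."}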
if __name__ == "__main__":
# Adding command line arguments for start_date and end_date with default values as the current date
parser = argparse.ArgumentParser(description="Scrape and process scientific papers from bioRxiv.")
parser.add_argument("--start-date", default=str(datetime.date.today()), help="Start date for the scraping (format: 'YYYY-MM-DD').")
parser.add_argument("--end-date", default=str(datetime.date.today()), help="End date for the scraping (format: 'YYYY-MM-DD').")
parser.add_argument("--outfile", default="bio.jsonl", help="Output file to save the metadata in JSON Lines format.")
args = parser.parse_args()
# Calling the scrapeBiorxiv function with command line arguments
scrapeBiorxiv(args.start_date, args.end_date, args.out_file)
# Additional code for keyword search if needed
querier = XRXivQuery(args.out_file)
biology = ['Bioinformatics', 'Molecular Biology', 'Bioengineering', 'Biochemistry']
query = [biology]
querier.search_keywords(query, output_filepath='bio_key.jsonl')
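
# Example invocation (hypothetical dates):
#   python scraper.py --start-date 2024-01-01 --end-date 2024-01-07 --outfile bio.jsonl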