fix(graphy): Fix bug of retrieving more than one result from arxiv #634

Merged
merged 2 commits on Dec 12, 2024
8 changes: 4 additions & 4 deletions python/graphy/apps/paper_reading/paper_navigate_edge.py
@@ -167,7 +167,7 @@ def execute(
link_queue = Queue()
output_queue = Queue()

- logger.info("================= START NAVIGATE ==============")
+ logger.warning(f"================= START NAVIGATE ==============")
paper_data_id_list = []
for paper in input:
if not paper:
@@ -379,7 +379,7 @@ def download_worker(self, link_queue):
download_folder=self.paper_download_dir,
meta_folder=self.meta_folder_dir,
)
- download_list = arxiv_fetcher.download_paper(link, 1)
+ download_list = arxiv_fetcher.download_paper(link, 5)

if len(download_list) == 0:
logger.info(f"PASS {link} to SCHOLAR FOR FURTHER SEARCH")
@@ -493,7 +493,7 @@ def download_worker(self, scholar_link_queue):
scholar_link_queue.task_done()
continue

- logger.info(
+ logger.error(
f"-------------- SCHOLAR DOWNLOAD WORKER: {link} ------------------"
)

@@ -525,7 +525,7 @@ def download_worker(self, scholar_link_queue):

scholar_link_queue.task_done()

- logger.info(
+ logger.error(
f"-------------- FINISH SCHOLAR WORKER: {link} ------------------"
)

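Note: the hunks above bump the number of candidates requested from the arXiv fetcher from 1 to 5, so a near-miss title can still surface a usable match, and they raise the log level of the worker banners. A minimal sketch of the surrounding worker pattern, assuming only what the diff shows (a download_paper(link, n) method and a scholar queue as the fallback); names outside the diff are hypothetical:

from queue import Queue

def download_worker(link_queue: Queue, scholar_link_queue: Queue, arxiv_fetcher) -> None:
    # Drain arXiv links; hand misses to the Google Scholar worker.
    while True:
        link = link_queue.get()
        if link is None:  # hypothetical sentinel to stop the worker
            link_queue.task_done()
            break
        # Ask for up to 5 candidates instead of 1 (the change in this PR).
        download_list = arxiv_fetcher.download_paper(link, 5)
        if len(download_list) == 0:
            # Nothing found on arXiv: pass the link to Scholar for further search.
            scholar_link_queue.put(link)
        link_queue.task_done()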
1 change: 1 addition & 0 deletions python/graphy/apps/paper_reading/paper_reading_nodes.py
@@ -602,6 +602,7 @@ def execute(

if not paper_file_path:
logger.error("No 'paper_file_path' provided in input data.")
+ logger.error(f"create fake extractor {paper_meta_path}")
if not paper_meta_path:
continue
try:
2 changes: 2 additions & 0 deletions python/graphy/utils/__init__.py
@@ -8,6 +8,7 @@
from .paper_struct import Paper
from .timer import Timer
from .bib_search import BibSearchGoogleScholar, BibSearchArxiv, BibSearchPubMed
+ from .string_similarity import StringSimilarity

from .json_parser import (
JsonParserType,
@@ -30,4 +31,5 @@
"BibSearchGoogleScholar",
"BibSearchArxiv",
"BibSearchPubMed",
+ "StringSimilarity",
]
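The package now also exports StringSimilarity, which bib_search.py uses below for title matching. Its implementation (python/graphy/utils/string_similarity.py) is not included in the hunks shown here, so the following is only a rough sketch of the two class methods the diff calls, ratio_similarity and semantic_similarity; the real module may differ (for example, semantic_similarity could be embedding-based):

import difflib

class StringSimilarity:
    # Hypothetical stand-in for the exported helper; not the merged implementation.

    @staticmethod
    def ratio_similarity(a: str, b: str) -> float:
        # Character-level similarity in [0, 1], the same idea as the
        # difflib.SequenceMatcher call this PR replaces in bib_search.py.
        return difflib.SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def semantic_similarity(a: str, b: str) -> float:
        # Token-overlap (Jaccard) score as a placeholder for a model-based measure.
        ta, tb = set(a.lower().split()), set(b.lower().split())
        if not ta or not tb:
            return 0.0
        return len(ta & tb) / len(ta | tb)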
2 changes: 1 addition & 1 deletion python/graphy/utils/arxiv_fetcher.py
@@ -70,7 +70,7 @@ def __init__(
:param timeout: The maximum time (in seconds) allowed for each paper fetching operation.
:param download_folder: The folder where the fetched papers will be downloaded.
"""
- self.client = arxiv.Client(delay_seconds=0.2, page_size=3, num_retries=1)
+ self.client = arxiv.Client(delay_seconds=0.2, page_size=5, num_retries=1)
self.timeout = timeout
self.download_folder = download_folder
self.bib_search_arxiv = BibSearchArxiv(
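Raising page_size from 3 to 5 is the core of the fix named in the PR title: the arXiv client is asked for several candidates per query rather than effectively one. A self-contained illustration with the arxiv package, mirroring the client settings from the diff (the query string and max_results value are examples, not taken from the repository):

import arxiv

# Same client settings as in ArxivFetcher.__init__ after this PR.
client = arxiv.Client(delay_seconds=0.2, page_size=5, num_retries=1)
search = arxiv.Search(query="graph neural networks", max_results=5)

for result in client.results(search):
    # Each result carries the metadata needed to pick the best title match.
    print(result.title, result.entry_id)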
129 changes: 94 additions & 35 deletions python/graphy/utils/bib_search.py
@@ -26,6 +26,7 @@
import threading

from google_scholar_py import CustomGoogleScholarOrganic
+ from .string_similarity import StringSimilarity

logger = logging.getLogger(__name__)

@@ -71,7 +72,10 @@ def __init__(self, persist_store=None, web_data_folder="", meta_folder="") -> None:

self.web_data_folder = web_data_folder

- self.request_interval = 5
+ self.request_interval = 10

+ def _formulate_query(self, query):
+ return re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())

def safe_request(self, driver, link):
with BibSearchGoogleScholar.google_scholar_request_lock:
@@ -80,13 +84,13 @@ def safe_request(self, driver, link):
interval = time.time() - BibSearchGoogleScholar.last_request_google_scholar
if interval < self.request_interval:
time_to_wait = (
- random.uniform(self.request_interval, self.request_interval + 5)
+ random.uniform(self.request_interval, self.request_interval + 6)
- interval
)

time.sleep(time_to_wait)

- logger.info(f"Time Issues: {time.time()} - {time_to_wait} {link}")
+ logger.warning(f"Time Issues: {time.time()} - {time_to_wait} {link}")

driver.get(link)

@@ -429,7 +433,8 @@ def _get_cited_by_paper_names(self, driver, link, max_results=50):
if page_num == 0:
refined_link = f"{link}"
else:
- refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}&scipsc="
+ refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}"
+ # refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}&scipsc="

driver = self.safe_request(
driver=driver,
Expand All @@ -451,8 +456,9 @@ def _get_cited_by_paper_names(self, driver, link, max_results=50):
if (
"not a robot" in driver.page_source
or "may be sending automated queries" in driver.page_source
+ or "您的计算机网络中存在异常流量" in driver.page_source
):
- logger.error("Detected as a spider")
+ logger.error("============== DETECTED AS A SPIDER ===============")
parser = LexborHTMLParser(driver.page_source)

if get_content:
@@ -496,9 +502,21 @@ def parse(
title = str(time.time())

if mode == "exact":
- similarity = difflib.SequenceMatcher(
- None, title.lower(), query.lower()
- ).ratio()
+ if self._formulate_query(title).lower() in query.lower():
+ similarity = 1
+ else:
+ similarity = StringSimilarity.ratio_similarity(
+ title.lower(), query.lower()
+ )
+ # similarity = StringSimilarity.semantic_similarity(
+ # title.lower(), query.lower()
+ # )
+ # similarity = difflib.SequenceMatcher(
+ # None, title.lower(), query.lower()
+ # ).ratio()
+ similarity = StringSimilarity.semantic_similarity(
+ title.lower(), query.lower()
+ )
logger.info(
f"Scholar compared with: {query}, Found paper: {title} with similarity {similarity}"
)
@@ -524,9 +542,11 @@
outputs.append(this_bib)

elif action == "download":
+ logger.error("start to download")
succ, file_path, file_name, exist = self.download(
driver, title, result, download_path
)
+ logger.error("finish to download")

if not succ and not exist:
logger.warning(f"Found {title}, but download failed.")
@@ -555,7 +575,6 @@ def parse(
driver, title, result, cite_directory
)

- logger.error(f"already get bib: {this_bib}")
if (
"cited_by_link" in this_bib
and this_bib["cited_by_link"] is not None
@@ -564,7 +583,6 @@
this_bib["cited_by"] = self._get_cited_by_paper_names(
driver, this_bib["cited_by_link"]
)
- logger.error(f"finish use this bib")
except Exception as e:
if this_bib is None:
meta_file_path = None
@@ -574,10 +592,14 @@
self.persist_store.save_state(
self.meta_folder, file_name, this_bib
)
- if meta_file_path:
+ if succ and meta_file_path:
outputs.append((True, file_path, meta_file_path, exist))
- else:
- outputs.append((succ, file_path, meta_file_path, exist))
+ elif succ and not meta_file_path:
+ outputs.append((True, file_path, meta_file_path, exist))
+ elif not succ and meta_file_path:
+ outputs.append((True, None, meta_file_path, exist))
+ elif not succ and not meta_file_path:
+ outputs.append((False, None, meta_file_path, exist))

if len(outputs) >= num_per_page:
break
@@ -721,7 +743,7 @@ def search_by_name(

page_num = 0
organic_results_data = []
- pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+ pruned_query = self._formulate_query(query)
logger.info(f"pruned query {pruned_query}")

# parse all pages
@@ -800,7 +822,8 @@ def download(self, driver, title, result, download_path):
pdf_link: str = result.css_first(".gs_or_ggsm a").attrs["href"]

headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+ "referer": "https://scholar.google.com/",
}
response = requests.get(pdf_link, headers=headers)

@@ -811,18 +834,19 @@
return True, file_path, scholar_name, False
else:
logger.warning(
- f"Failed to download. Status code: {response.status_code}"
+ f"Failed to download. Status code: {response.status_code}. Try to fix ..."
)

with open("fail_log.log", "a") as f:
f.write(file_path + "\n")
- f.write(pdf_link + "\n")
+ # f.write(pdf_link + "\n")
+ f.write("STATUS CODE: " + str(response.status_code) + "\n")
f.write("\n")
except Exception as e:
logger.error(f"Download failed: {e}")
with open("fail_log.log", "a") as f:
f.write(file_path + "\n")
- f.write(pdf_link + "\n")
+ # f.write(pdf_link + "\n")
f.write(str(e) + "\n")
f.write("\n")

@@ -837,9 +861,18 @@ def condition_met(driver):
except Exception:
element_present = False

+ if not element_present:
+ try:
+ element_present = EC.presence_of_element_located(
+ (By.ID, "gs_res_ccl_mid")
+ )(driver)
+ except Exception:
+ element_present = False

text_present = (
"not a robot" in driver.page_source
or "may be sending automated queries" in driver.page_source
+ or "您的计算机网络中存在异常流量" in driver.page_source
)

return element_present or text_present
@@ -876,7 +909,7 @@ def download_by_name(
if pagination:
while page_num <= 10:
# parse all pages (the first two pages)
- pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+ pruned_query = self._formulate_query(query)
driver = self.safe_request(
driver=driver,
link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=en&gl=us&start={page_num}",
@@ -911,27 +944,53 @@
else:
# parse first page only
# logger.error("### START TO DOWNLOAD #####")
- pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+ pruned_query = self._formulate_query(query)
# logger.error(pruned_query)

- driver = self.safe_request(
- driver=driver,
- link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=en&gl=us&start={page_num}",
- )

- WebDriverWait(driver, 10).until(self.finish_load_condition())
+ retry_times = 0
+ max_retry_times = 3

- parser = LexborHTMLParser(driver.page_source)
+ while retry_times <= max_retry_times:
+ retry_times += 1
+ driver = self.safe_request(
+ driver=driver,
+ link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=zh-cn&gl=us&as_sdt=0,5&start={page_num}",
+ )

- if len(parser.css(".gs_r.gs_or.gs_scl")) == 0:
- if "not a robot" in driver.page_source:
- logger.error(
- f"============== DETECTED AS A ROBOT {query} ============="
+ try:
+ WebDriverWait(driver, 10).until(
+ self.finish_load_condition()
+ )
- # with open("fail_log.log", "a") as f:
- # f.write(query + "\n")
- # f.write("no label\n")
- # f.write(driver.page_source)
+ except TimeoutException as e:
+ logger.error(f"Cannot Get Cited by Timeout Error: {e}")
+ except Exception as e:
+ logger.error(f"Cannot Get Cited by Error: {e}")

+ parser = LexborHTMLParser(driver.page_source)

+ if len(parser.css(".gs_r.gs_or.gs_scl")) == 0:
+ if (
+ "not a robot" in driver.page_source
+ or "may be sending automated queries"
+ in driver.page_source
+ or "您的计算机网络中存在异常流量" in driver.page_source
+ ):
+ logger.error(
+ f"============== DETECTED AS A ROBOT {query} ============="
+ )
+ logger.error(
+ f"https://scholar.google.com/scholar?q={pruned_query}&hl=zh-cn&gl=us&start={page_num}"
+ )
+ logger.error(
+ f"===== TO RETRY {retry_times}/{max_retry_times}"
+ )
+ time.sleep(random.uniform(8, 15))
+ # with open("fail_log.log", "a") as f:
+ # f.write(query + "\n")
+ # f.write("no label\n")
+ # f.write(driver.page_source)
+ else:
+ break

succ_list = self.parse(
driver, query, parser, mode, "download", download_path
Expand Down