Skip to content

Commit

Permalink
refine paper
Browse files Browse the repository at this point in the history
  • Loading branch information
longbinlai committed Jan 9, 2025
1 parent ea1110a commit 9fd9cfc
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 146 deletions.
21 changes: 0 additions & 21 deletions python/graphy/graph/nodes/pdf_extract_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,6 @@ def execute(
pdf_extractor.compute_links()
pdf_paper_references = list(pdf_extractor.linked_contents)

paper = Paper.from_pdf_metadata(paper_metadata)
paper_dict = paper.to_dict()
else:
paper_dict = paper_metadata

Expand All @@ -90,25 +88,6 @@ def execute(

pdf_extractor.clear()

if self.arxiv_fetch_paper:
paper_title = paper_dict.get("title", "")
if len(paper_title) > 0:
result, bib_text = self.arxiv_fetcher.fetch_paper(paper_title, 5)
if result is None and bib_text is None:
if self.scholar_fetch_paper:
self.scholar_fetcher.set_web_data_folder(
os.path.join(
WF_WEBDATA_DIR,
paper_dict.get("id", f"webdata_{int(time.time())}"),
),
)
result, bib_text = self.scholar_fetcher.fetch_paper(
paper.title, mode="exact"
)
if result:
paper_dict.update(result)
if bib_text is not None:
paper_dict["bib"] = bib_text.replace("\n", "\\n")
logger.debug("=========== PAPER INFO ===============")
logger.debug(paper_dict)

Expand Down
178 changes: 53 additions & 125 deletions python/graphy/utils/paper_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,130 +5,58 @@


class Paper:
    """
    Plain record holding the common bibliographic fields of a paper.

    Instances can be converted to a dictionary with ``to_dict`` and rebuilt
    from such a dictionary with ``from_dict``.
    """

    def __init__(
        self,
        id: str,
        published: datetime,
        year: str,
        month: str,
        title: str,
        authors: List[str],
        summary: str,
        journal_ref: str,
        doi: str,
        primary_category: str,
        categories: List[str],
        bib: str,
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Identity and publication date
        self.id = id
        self.published = published
        self.year = year
        self.month = month

        # Descriptive content
        self.title = title
        self.authors = authors
        self.summary = summary

        # Bibliographic references and classification
        self.journal_ref = journal_ref
        self.doi = doi
        self.primary_category = primary_category
        self.categories = categories
        self.bib = bib

    def to_dict(self) -> Dict:
        """
        Return the paper as a dictionary keyed by attribute name.

        ``published`` is emitted as an ISO-8601 string (via ``isoformat``);
        every other value is passed through as stored.
        """
        record = dict(
            id=self.id,
            published=self.published.isoformat(),
            year=self.year,
            month=self.month,
            title=self.title,
            authors=self.authors,
            summary=self.summary,
            journal_ref=self.journal_ref,
            doi=self.doi,
            primary_category=self.primary_category,
            categories=self.categories,
            bib=self.bib,
        )
        return record

    @classmethod
    def from_dict(cls, data: Dict):
        """
        Build a paper from a dictionary shaped like the output of ``to_dict``.

        All twelve keys are required (a missing key raises ``KeyError``), and
        ``data["published"]`` must be a string accepted by
        ``datetime.fromisoformat``.
        """
        # Look the values up in declaration order so that the first failing
        # key/parse is the one reported.
        paper_id = data["id"]
        published_at = datetime.fromisoformat(data["published"])
        return cls(
            id=paper_id,
            published=published_at,
            year=data["year"],
            month=data["month"],
            title=data["title"],
            authors=data["authors"],
            summary=data["summary"],
            journal_ref=data["journal_ref"],
            doi=data["doi"],
            primary_category=data["primary_category"],
            categories=data["categories"],
            bib=data["bib"],
        )

@classmethod
def from_pdf_metadata(cls, metadata: Dict):
def parse_creation_date(date_str: str) -> datetime:
date_str = date_str[2:] # Remove leading "D:"
date_formats = [
"%Y%m%d%H%M%S%z",
"%Y%m%d%H%M%S",
"%Y%m%d%H%M%S%z00'00'",
"%Y%m%d%H%M%SZ",
"%Y%m%d%H%M%SZ00'00'",
]
# Attempt to match specific format with explicit offset
match = re.match(r"(\d{14})([+-]\d{2})'(\d{2})'", date_str)
if match:
date_part, hour_offset, minute_offset = match.groups()
date_obj = datetime.strptime(date_part, "%Y%m%d%H%M%S")
# Create timezone offset
offset = int(hour_offset) * 60 + int(minute_offset)
if int(hour_offset) < 0:
offset = -offset
return date_obj - timedelta(minutes=offset)
for date_format in date_formats:
try:
return datetime.strptime(date_str, date_format)
except ValueError:
continue
raise ValueError(f"Unknown date format: {date_str}")

def clean_author_name(name: str) -> str:
# Define the allowed characters: alphabets, hyphens, apostrophes, spaces, and special characters
allowed_characters = re.compile(r"[^a-zA-Z\s\-'À-ÖØ-öø-ÿĀ-žḀ-ỿ]")
return allowed_characters.sub("", name)

try:
published_date = parse_creation_date(metadata["creationDate"])
except ValueError as e:
try:
published_date = parse_creation_date(metadata["modDate"])
except ValueError as e:
published_date = datetime.now()

authors_str = metadata.get("author", "")
authors_list = [
clean_author_name(author.strip())
for author in authors_str.replace(" and ", ", ").split(",")
if author
@staticmethod
def header(cls) -> List[str]:
return [
"id",
"title",
"author",
"authors",
"year",
"month",
"published",
"summary",
"primary_category",
"categories",
"doi",
"eprint",
"journal_ref",
"url",
"bib",
"references",
"cited_by",
"cited_by_count",
]

return cls(
id=str(hash(metadata.get("title", "").lower())),
published=published_date,
year=published_date.year,
month=published_date.month,
title=metadata.get("title", ""),
authors=authors_list,
summary="",
journal_ref="",
doi="",
primary_category="",
categories=[],
bib="",
)
@staticmethod
def parse_creation_date(cls, date_str: str) -> datetime:
date_str = date_str[2:] # Remove leading "D:"
date_formats = [
"%Y%m%d%H%M%S%z",
"%Y%m%d%H%M%S",
"%Y%m%d%H%M%S%z00'00'",
"%Y%m%d%H%M%SZ",
"%Y%m%d%H%M%SZ00'00'",
]
# Attempt to match specific format with explicit offset
match = re.match(r"(\d{14})([+-]\d{2})'(\d{2})'", date_str)
if match:
date_part, hour_offset, minute_offset = match.groups()
date_obj = datetime.strptime(date_part, "%Y%m%d%H%M%S")
# Create timezone offset
offset = int(hour_offset) * 60 + int(minute_offset)
if int(hour_offset) < 0:
offset = -offset
return date_obj - timedelta(minutes=offset)
for date_format in date_formats:
try:
return datetime.strptime(date_str, date_format)
except ValueError:
continue
raise ValueError(f"Unknown date format: {date_str}")

@staticmethod
def clean_author_name(name: str) -> str:
# Define the allowed characters: alphabets, hyphens, apostrophes, spaces, and special characters
allowed_characters = re.compile(r"[^a-zA-Z\s\-'À-ÖØ-öø-ÿĀ-žḀ-ỿ]")
return allowed_characters.sub("", name)

0 comments on commit 9fd9cfc

Please sign in to comment.