refine paper

GraphScope · Jan 9, 2025 · 9fd9cfc · 9fd9cfc
1 parent ea1110a
commit 9fd9cfc
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 146 deletions.
diff --git a/python/graphy/graph/nodes/pdf_extract_node.py b/python/graphy/graph/nodes/pdf_extract_node.py
@@ -78,8 +78,6 @@ def execute(
                 pdf_extractor.compute_links()
                 pdf_paper_references = list(pdf_extractor.linked_contents)
 
-            paper = Paper.from_pdf_metadata(paper_metadata)
-            paper_dict = paper.to_dict()
         else:
             paper_dict = paper_metadata
 
@@ -90,25 +88,6 @@ def execute(
 
         pdf_extractor.clear()
 
-        if self.arxiv_fetch_paper:
-            paper_title = paper_dict.get("title", "")
-            if len(paper_title) > 0:
-                result, bib_text = self.arxiv_fetcher.fetch_paper(paper_title, 5)
-                if result is None and bib_text is None:
-                    if self.scholar_fetch_paper:
-                        self.scholar_fetcher.set_web_data_folder(
-                            os.path.join(
-                                WF_WEBDATA_DIR,
-                                paper_dict.get("id", f"webdata_{int(time.time())}"),
-                            ),
-                        )
-                        result, bib_text = self.scholar_fetcher.fetch_paper(
-                            paper.title, mode="exact"
-                        )
-                if result:
-                    paper_dict.update(result)
-                if bib_text is not None:
-                    paper_dict["bib"] = bib_text.replace("\n", "\\n")
         logger.debug("=========== PAPER INFO ===============")
         logger.debug(paper_dict)
 

diff --git a/python/graphy/utils/paper_struct.py b/python/graphy/utils/paper_struct.py
@@ -5,130 +5,58 @@
 
 
 class Paper:
-    """
-    An object that defines common paper structure
-    """
-
-    def __init__(
-        self,
-        id: str,
-        published: datetime,
-        year: str,
-        month: str,
-        title: str,
-        authors: List[str],
-        summary: str,
-        journal_ref: str,
-        doi: str,
-        primary_category: str,
-        categories: List[str],
-        bib: str,
-    ):
-        self.id = id
-        self.published = published
-        self.year = year
-        self.month = month
-        self.title = title
-        self.authors = authors
-        self.summary = summary
-        self.journal_ref = journal_ref
-        self.doi = doi
-        self.primary_category = primary_category
-        self.categories = categories
-        self.bib = bib
-
-    def to_dict(self) -> Dict:
-        return {
-            "id": self.id,
-            "published": self.published.isoformat(),
-            "year": self.year,
-            "month": self.month,
-            "title": self.title,
-            "authors": self.authors,
-            "summary": self.summary,
-            "journal_ref": self.journal_ref,
-            "doi": self.doi,
-            "primary_category": self.primary_category,
-            "categories": self.categories,
-            "bib": self.bib,
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict):
-        return cls(
-            id=data["id"],
-            published=datetime.fromisoformat(data["published"]),
-            year=data["year"],
-            month=data["month"],
-            title=data["title"],
-            authors=data["authors"],
-            summary=data["summary"],
-            journal_ref=data["journal_ref"],
-            doi=data["doi"],
-            primary_category=data["primary_category"],
-            categories=data["categories"],
-            bib=data["bib"],
-        )
-
-    @classmethod
-    def from_pdf_metadata(cls, metadata: Dict):
-        def parse_creation_date(date_str: str) -> datetime:
-            date_str = date_str[2:]  # Remove leading "D:"
-            date_formats = [
-                "%Y%m%d%H%M%S%z",
-                "%Y%m%d%H%M%S",
-                "%Y%m%d%H%M%S%z00'00'",
-                "%Y%m%d%H%M%SZ",
-                "%Y%m%d%H%M%SZ00'00'",
-            ]
-            # Attempt to match specific format with explicit offset
-            match = re.match(r"(\d{14})([+-]\d{2})'(\d{2})'", date_str)
-            if match:
-                date_part, hour_offset, minute_offset = match.groups()
-                date_obj = datetime.strptime(date_part, "%Y%m%d%H%M%S")
-                # Create timezone offset
-                offset = int(hour_offset) * 60 + int(minute_offset)
-                if int(hour_offset) < 0:
-                    offset = -offset
-                return date_obj - timedelta(minutes=offset)
-            for date_format in date_formats:
-                try:
-                    return datetime.strptime(date_str, date_format)
-                except ValueError:
-                    continue
-            raise ValueError(f"Unknown date format: {date_str}")
-
-        def clean_author_name(name: str) -> str:
-            # Define the allowed characters: alphabets, hyphens, apostrophes, spaces, and special characters
-            allowed_characters = re.compile(r"[^a-zA-Z\s\-'À-ÖØ-öø-ÿĀ-žḀ-ỿ]")
-            return allowed_characters.sub("", name)
-
-        try:
-            published_date = parse_creation_date(metadata["creationDate"])
-        except ValueError as e:
-            try:
-                published_date = parse_creation_date(metadata["modDate"])
-            except ValueError as e:
-                published_date = datetime.now()
-
-        authors_str = metadata.get("author", "")
-        authors_list = [
-            clean_author_name(author.strip())
-            for author in authors_str.replace(" and ", ", ").split(",")
-            if author
+    @classmethod
+    def header(cls) -> List[str]:  # paper-record key names, in a fixed order
+        return [
+            "id",
+            "title",
+            "author",
+            "authors",
+            "year",
+            "month",
+            "published",
+            "summary",
+            "primary_category",
+            "categories",
+            "doi",
+            "eprint",
+            "journal_ref",
+            "url",
+            "bib",
+            "references",
+            "cited_by",
+            "cited_by_count",
         ]
 
-        return cls(
-            id=str(hash(metadata.get("title", "").lower())),
-            published=published_date,
-            year=published_date.year,
-            month=published_date.month,
-            title=metadata.get("title", ""),
-            authors=authors_list,
-            summary="",
-            journal_ref="",
-            doi="",
-            primary_category="",
-            categories=[],
-            bib="",
-        )
+    @classmethod
+    def parse_creation_date(cls, date_str: str) -> datetime:  # PDF date string -> datetime
+        date_str = date_str[2:] if date_str.startswith("D:") else date_str  # "D:" prefix is optional
+        date_formats = [
+            "%Y%m%d%H%M%S%z",
+            "%Y%m%d%H%M%S",
+            "%Y%m%d%H%M%S%z00'00'",
+            "%Y%m%d%H%M%SZ",
+            "%Y%m%d%H%M%SZ00'00'",
+        ]
+        # Explicit +HH'mm' / -HH'mm' offset: shift the naive local time to UTC
+        match = re.match(r"(\d{14})([+-]\d{2})'(\d{2})'", date_str)
+        if match:
+            date_part, hour_offset, minute_offset = match.groups()
+            date_obj = datetime.strptime(date_part, "%Y%m%d%H%M%S")
+            # The sign applies to hours and minutes together, so build the magnitude first
+            offset = abs(int(hour_offset)) * 60 + int(minute_offset)
+            if hour_offset.startswith("-"):
+                offset = -offset
+            return date_obj - timedelta(minutes=offset)
+        for date_format in date_formats:
+            try:
+                return datetime.strptime(date_str, date_format)
+            except ValueError:
+                continue
+        raise ValueError(f"Unknown date format: {date_str}")
+
+    @staticmethod
+    def clean_author_name(name: str) -> str:
+        # Drop every character that is not an ASCII/accented-Latin letter, whitespace, hyphen, or apostrophe
+        disallowed = re.compile(r"[^a-zA-Z\s\-'À-ÖØ-öø-ÿĀ-žḀ-ỿ]")
+        return disallowed.sub("", name)