Skip to content

Commit

Permalink
better title_hash
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Sep 5, 2024
1 parent 28acdef commit b9ce6ec
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
6 changes: 5 additions & 1 deletion dblp_crawler/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
logger = logging.getLogger("parser")


def title_hash(title: str) -> str:
    """Return a normalized key for comparing publication titles.

    The title is lowercased and every character other than ASCII digits
    and lowercase ASCII letters is removed. If nothing is left after
    stripping (for example, the title has no ASCII letters or digits at
    all), the lowercased title itself is returned so that such titles do
    not all collapse to the same empty key.
    """
    lowered = title.lower()
    # `or` supplies the fallback only when the stripped result is empty.
    return re.sub(r"[^0-9a-z]", "", lowered) or lowered


class Person:
def __init__(self, data: ElementTree.Element) -> None:
assert data.tag == "person", "Should be xml of a person in dblpperson!"
Expand Down Expand Up @@ -74,7 +78,7 @@ def title(self) -> str:
return " ".join(t for t in child.itertext())

def title_hash(self) -> str:
return re.sub(r"[^0-9a-z\u4e00-\u9fa5]", "", self.title().lower())
return title_hash(self.title())

def journal(self) -> Optional[str]:
tag = {
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setup(
name='dblp_crawler',
version='2.1.5',
version='2.1.6',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/dblp-crawler',
Expand Down

0 comments on commit b9ce6ec

Please sign in to comment.