Skip to content

Commit

Permalink
fixup ignore br and spaces after timestamp
Browse files: browse the repository at this point in the history
  • Loading branch information
dracos committed Apr 23, 2024
1 parent 2a424a6 commit 042d2ad
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions pyscraper/sp_2024/parse.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -143,10 +143,14 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
while next_sibling:
if isinstance(next_sibling, NavigableString) and str(next_sibling).strip():
nt = soup.new_tag("timestamp")
nt.string = next_sibling.strip()
# replace all weird whitespace with normal space
nt.string = nt.string.replace("\xa0", " ")
nt.next_sibling = next_sibling.next_sibling
nt.string = next_sibling.replace("\xa0", " ").strip()
ns = next_sibling.next_sibling
while ns and (ns.name == 'br' or (isinstance(ns, NavigableString) and not ns.strip())):
next_ns = ns.next_sibling
ns.extract()
ns = next_ns
nt.next_sibling = ns
# delete original navigablestring
next_sibling.extract()
next_sibling = nt
Expand Down

0 comments on commit 042d2ad

Please sign in to comment.