forked from langchain-ai/chat-langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
110 lines (98 loc) · 4.78 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import re
from typing import Generator
from bs4 import BeautifulSoup, Doctype, NavigableString, Tag
# Tags removed from the document before any text is extracted.
_SCAPE_TAGS = ["nav", "footer", "aside", "script", "style"]
_HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
# Matches a CSS class such as "language-python" on a <pre> element.
_LANGUAGE_CLASS = re.compile(r"language-\w+")


def _fenced_code_block(code: Tag, pre: Tag) -> str:
    """Render a ``<code>`` element nested in ``<pre>`` as a fenced code block.

    The fence language comes from the first ``language-*`` class on ``pre``;
    it is left empty when no such class exists.
    """
    classes = pre.attrs.get("class", "")
    language_class = next(
        (name for name in classes if _LANGUAGE_CLASS.match(name)), None
    )
    if language_class is None:
        language = ""
    else:
        # maxsplit=1 keeps hyphenated names whole
        # ("language-shell-session" -> "shell-session").
        language = language_class.split("-", 1)[1]

    token_lines = code.find_all("span", class_="token-line")
    if token_lines:
        # Take each line span's own text: a recursive search for descendant
        # spans would count nested spans twice and skip text that sits
        # directly inside the line span.
        code_content = "\n".join(line.get_text() for line in token_lines)
    else:
        # No per-line markup: fall back to the raw text instead of emitting
        # an empty fence.
        code_content = code.get_text().rstrip("\n")
    return f"```{language}\n{code_content}\n```\n\n"


def _table_to_markdown(table: Tag) -> Generator[str, None, None]:
    """Yield the pieces of a pipe-delimited Markdown table for ``table``.

    A header line and separator are produced only when ``<thead>`` contains
    ``<th>`` cells. Body rows come from ``<tbody>`` when present; otherwise
    from every ``<tr>`` of the table that is not one of the ``<thead>`` rows.
    """
    thead = table.find("thead")
    has_thead = isinstance(thead, Tag)
    header_cells = thead.find_all("th") if has_thead else []

    tbody = table.find("tbody")
    if isinstance(tbody, Tag):
        body_rows = tbody.find_all("tr")
    else:
        # Tables written without <tbody> would otherwise lose all their rows.
        # Header rows are excluded by identity, because Tag equality compares
        # content and would also drop look-alike body rows.
        header_rows = thead.find_all("tr") if has_thead else []
        body_rows = [
            row
            for row in table.find_all("tr")
            if not any(row is header_row for header_row in header_rows)
        ]

    if header_cells:
        yield "| "
        yield " | ".join(cell.get_text() for cell in header_cells)
        yield " |\n"
        yield "| "
        yield " | ".join("----" for _ in header_cells)
        yield " |\n"

    for row in body_rows:
        # <th> cells are included so row-header cells keep their column.
        cells = row.find_all(["td", "th"])
        yield "| "
        yield " | ".join(cell.get_text(strip=True) for cell in cells)
        yield " |\n"

    yield "\n\n"


def langchain_docs_extractor(soup: BeautifulSoup) -> str:
    """Convert a parsed HTML document into Markdown-style text.

    ``soup`` is modified in place: every ``nav``, ``footer``, ``aside``,
    ``script`` and ``style`` element is removed before extraction.

    Headings, links, images, emphasis, inline and fenced code, paragraphs,
    lists, ``tabs-container`` divs and tables are mapped to Markdown
    equivalents; ``button`` elements are skipped; any other element is
    replaced by the rendering of its children.

    Args:
        soup: The parsed document to extract text from.

    Returns:
        The extracted text with runs of blank lines collapsed to a single
        blank line and leading/trailing whitespace stripped.
    """
    # A plain loop: decompose() is called for its side effect only.
    for tag in soup.find_all(_SCAPE_TAGS):
        tag.decompose()

    def get_text(tag: Tag) -> Generator[str, None, None]:
        """Yield Markdown fragments for the children of ``tag``, in order."""
        for child in tag.children:
            # Checked before NavigableString so the doctype is never emitted.
            if isinstance(child, Doctype):
                continue

            if isinstance(child, NavigableString):
                yield child
                continue

            if not isinstance(child, Tag):
                continue

            name = child.name
            if name in _HEADING_TAGS:
                # "h3" -> "###"
                yield f"{'#' * int(name[1:])} {child.get_text()}\n\n"
            elif name == "a":
                label = child.get_text(strip=False)
                href = child.get("href")
                if href is None:
                    # No target: emit the text alone, not "[text](None)".
                    yield label
                else:
                    yield f"[{label}]({href})"
            elif name == "img":
                # Missing src renders as an empty target, not "None".
                yield f"![{child.get('alt', '')}]({child.get('src', '')})"
            elif name in ["strong", "b"]:
                yield f"**{child.get_text(strip=False)}**"
            elif name in ["em", "i"]:
                yield f"_{child.get_text(strip=False)}_"
            elif name == "br":
                yield "\n"
            elif name == "code":
                parent = child.find_parent()
                if parent is not None and parent.name == "pre":
                    yield _fenced_code_block(child, parent)
                else:
                    yield f"`{child.get_text(strip=False)}`"
            elif name == "p":
                yield from get_text(child)
                yield "\n\n"
            elif name == "ul":
                # Direct <li> children only; nested lists are reached through
                # the recursive call on each item.
                for li in child.find_all("li", recursive=False):
                    yield "- "
                    yield from get_text(li)
                    yield "\n\n"
            elif name == "ol":
                for i, li in enumerate(child.find_all("li", recursive=False)):
                    yield f"{i + 1}. "
                    yield from get_text(li)
                    yield "\n\n"
            elif name == "div" and "tabs-container" in child.attrs.get(
                "class", [""]
            ):
                # Pair each tab label with its panel by document order.
                tabs = child.find_all("li", {"role": "tab"})
                tab_panels = child.find_all("div", {"role": "tabpanel"})
                for tab, tab_panel in zip(tabs, tab_panels):
                    tab_name = tab.get_text(strip=True)
                    yield f"{tab_name}\n"
                    yield from get_text(tab_panel)
            elif name == "table":
                yield from _table_to_markdown(child)
            elif name in ["button"]:
                continue
            else:
                yield from get_text(child)

    joined = "".join(get_text(soup))
    return re.sub(r"\n\n+", "\n\n", joined).strip()