diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py
index 2a74a97a5..4d69c8709 100644
--- a/application/parser/file/html_parser.py
+++ b/application/parser/file/html_parser.py
@@ -3,7 +3,6 @@
 Contains parser for html files.
 
 """
-import re
 from pathlib import Path
 from typing import Dict, Union
 
@@ -18,66 +17,8 @@ def _init_parser(self) -> Dict:
         return {}
 
     def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
-        """Parse file.
+        from langchain_community.document_loaders import BSHTMLLoader
 
-        Returns:
-            Union[str, List[str]]: a string or a List of strings.
-        """
-        try:
-            from unstructured.partition.html import partition_html
-            from unstructured.staging.base import convert_to_isd
-            from unstructured.cleaners.core import clean
-        except ImportError:
-            raise ValueError("unstructured package is required to parse HTML files.")
-
-        # Using the unstructured library to convert the html to isd format
-        # isd sample : isd = [
-        #     {"text": "My Title", "type": "Title"},
-        #     {"text": "My Narrative", "type": "NarrativeText"}
-        # ]
-        with open(file, "r", encoding="utf-8") as fp:
-            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-        # Removing non ascii charactwers from isd_el['text']
-        for isd_el in isd:
-            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-        # Removing all the extra spaces from isd_el['text'] using regex and replace with single space
-        for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-
-        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-        for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
-
-        # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-        # Creating 'Chunks' - List of lists of strings
-        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
-        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-        # Where Each Title is grouped together with the data under it
-
-        Chunks = [[]]
-        final_chunks = list(list())
-
-        for i, isd_el in enumerate(isd):
-            if i in title_indexes:
-                Chunks.append([])
-            Chunks[-1].append(isd_el['text'])
-
-        # Removing all the chunks with sum of length of all the strings in the chunk < 25
-        # TODO: This value can be an user defined variable
-        for chunk in Chunks:
-            # sum of length of all the strings in the chunk
-            sum = 0
-            sum += len(str(chunk))
-            if sum < 25:
-                Chunks.remove(chunk)
-            else:
-                # appending all the approved chunks to final_chunks as a single string
-                final_chunks.append(" ".join([str(item) for item in chunk]))
-        return final_chunks
+        loader = BSHTMLLoader(file)
+        data = loader.load()
+        return data
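
BSHTMLLoader parses the HTML with BeautifulSoup (hence the beautifulsoup4 entry added to requirements.txt below), and its load() method returns a list of langchain Document objects rather than plain strings: the page text is on page_content, while the source path and <title> end up in metadata, so callers of parse_file now receive Document objects. A minimal sketch of the new behaviour, assuming langchain_community and beautifulsoup4 are installed; the file name example.html is only a placeholder:

    from pathlib import Path
    from langchain_community.document_loaders import BSHTMLLoader

    # Parse a single HTML file; BeautifulSoup does the extraction under the hood
    loader = BSHTMLLoader(Path("example.html"))
    docs = loader.load()  # list containing one Document for the file

    for doc in docs:
        print(doc.metadata["source"])      # path of the parsed file
        print(doc.metadata.get("title"))   # contents of the <title> tag, "" if absent
        print(doc.page_content[:200])      # extracted text of the page
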
diff --git a/application/requirements.txt b/application/requirements.txt
index e326c876e..cd7e21c2c 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -1,5 +1,6 @@
 anthropic==0.12.0
 boto3==1.34.153
+beautifulsoup4==4.12.3
 celery==5.3.6
 dataclasses_json==0.6.7
 docx2txt==0.8
@@ -30,5 +31,4 @@ tiktoken
 torch
 tqdm==4.66.3
 transformers==4.43.4
-unstructured==0.12.2
 Werkzeug==3.0.3