From 54971104f8bcc909869fd79c775934e9e5a755ea Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Mon, 12 Aug 2024 16:03:02 +0100
Subject: [PATCH 1/3] Upgrade unstructured

---
 application/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/application/requirements.txt b/application/requirements.txt
index e326c876e..f261fca83 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -30,5 +30,5 @@ tiktoken
 torch
 tqdm==4.66.3
 transformers==4.43.4
-unstructured==0.12.2
+unstructured==0.15.1
 Werkzeug==3.0.3

From 5a2f3ad616acc5cd25456fd8e287d8d9d0340e91 Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Mon, 12 Aug 2024 16:35:23 +0100
Subject: [PATCH 2/3] feat: replace unstructured with BSHTMLLoader in HTML parser

---
 application/parser/file/html_parser.py | 66 ++------------------------
 application/requirements.txt           |  2 +-
 2 files changed, 5 insertions(+), 63 deletions(-)

diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py
index 2a74a97a5..9db23d984 100644
--- a/application/parser/file/html_parser.py
+++ b/application/parser/file/html_parser.py
@@ -18,66 +18,8 @@ def _init_parser(self) -> Dict:
         return {}
 
     def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
-        """Parse file.
+        from langchain_community.document_loaders import BSHTMLLoader
 
-            Returns:
-            Union[str, List[str]]: a string or a List of strings.
-        """
-        try:
-            from unstructured.partition.html import partition_html
-            from unstructured.staging.base import convert_to_isd
-            from unstructured.cleaners.core import clean
-        except ImportError:
-            raise ValueError("unstructured package is required to parse HTML files.")
-
-        # Using the unstructured library to convert the html to isd format
-        # isd sample : isd = [
-        #   {"text": "My Title", "type": "Title"},
-        #   {"text": "My Narrative", "type": "NarrativeText"}
-        # ]
-        with open(file, "r", encoding="utf-8") as fp:
-            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-            # Removing non ascii charactwers from isd_el['text']
-        for isd_el in isd:
-            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-        # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
-        for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
-
-        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-        for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
-
-        # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-        # Creating 'Chunks' - List of lists of strings 
-        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
-        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-        # Where Each Title is grouped together with the data under it
-
-        Chunks = [[]]
-        final_chunks = list(list())
-
-        for i, isd_el in enumerate(isd):
-            if i in title_indexes:
-                Chunks.append([])
-            Chunks[-1].append(isd_el['text'])
-
-        # Removing all the chunks with sum of length of all the strings in the chunk < 25
-        # TODO: This value can be an user defined variable
-        for chunk in Chunks:
-            # sum of length of all the strings in the chunk
-            sum = 0
-            sum += len(str(chunk))
-            if sum < 25:
-                Chunks.remove(chunk)
-            else:
-                # appending all the approved chunks to final_chunks as a single string       
-                final_chunks.append(" ".join([str(item) for item in chunk]))
-        return final_chunks
+        # Read as UTF-8 like the old parser; return text to match the annotation.
+        documents = BSHTMLLoader(file, open_encoding="utf-8").load()
+        return [document.page_content for document in documents]
diff --git a/application/requirements.txt b/application/requirements.txt
index f261fca83..cd7e21c2c 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -1,5 +1,6 @@
 anthropic==0.12.0
 boto3==1.34.153
+beautifulsoup4==4.12.3
 celery==5.3.6
 dataclasses_json==0.6.7
 docx2txt==0.8
@@ -30,5 +31,4 @@ tiktoken
 torch
 tqdm==4.66.3
 transformers==4.43.4
-unstructured==0.15.1
 Werkzeug==3.0.3

From 16aedd61da9c1361f2859a3970f720373196f6f2 Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Mon, 12 Aug 2024 16:37:03 +0100
Subject: [PATCH 3/3] fix: remove unused `re` import flagged by ruff

---
 application/parser/file/html_parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py
index 9db23d984..4d69c8709 100644
--- a/application/parser/file/html_parser.py
+++ b/application/parser/file/html_parser.py
@@ -3,7 +3,6 @@
 Contains parser for html files.
 
 """
-import re
 from pathlib import Path
 from typing import Dict, Union