From 54971104f8bcc909869fd79c775934e9e5a755ea Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 12 Aug 2024 16:03:02 +0100 Subject: [PATCH 1/3] Upgrade unstructured --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index e326c876e..f261fca83 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -30,5 +30,5 @@ tiktoken torch tqdm==4.66.3 transformers==4.43.4 -unstructured==0.12.2 +unstructured==0.15.1 Werkzeug==3.0.3 From 5a2f3ad616acc5cd25456fd8e287d8d9d0340e91 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 12 Aug 2024 16:35:23 +0100 Subject: [PATCH 2/3] feat: remove dep --- application/parser/file/html_parser.py | 66 ++------------------------ application/requirements.txt | 2 +- 2 files changed, 5 insertions(+), 63 deletions(-) diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py index 2a74a97a5..9db23d984 100644 --- a/application/parser/file/html_parser.py +++ b/application/parser/file/html_parser.py @@ -18,66 +18,8 @@ def _init_parser(self) -> Dict: return {} def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]: - """Parse file. + from langchain_community.document_loaders import BSHTMLLoader - Returns: - Union[str, List[str]]: a string or a List of strings. - """ - try: - from unstructured.partition.html import partition_html - from unstructured.staging.base import convert_to_isd - from unstructured.cleaners.core import clean - except ImportError: - raise ValueError("unstructured package is required to parse HTML files.") - - # Using the unstructured library to convert the html to isd format - # isd sample : isd = [ - # {"text": "My Title", "type": "Title"}, - # {"text": "My Narrative", "type": "NarrativeText"} - # ] - with open(file, "r", encoding="utf-8") as fp: - elements = partition_html(file=fp) - isd = convert_to_isd(elements) - - # Removing non ascii charactwers from isd_el['text'] - for isd_el in isd: - isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() - - # Removing all the \n characters from isd_el['text'] using regex and replace with single space - # Removing all the extra spaces from isd_el['text'] using regex and replace with single space - for isd_el in isd: - isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL) - isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL) - - # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation - for isd_el in isd: - clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True) - - # Creating a list of all the indexes of isd_el['type'] = 'Title' - title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title'] - - # Creating 'Chunks' - List of lists of strings - # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title' - # Each Chunk can be thought of as an individual set of data, which can be sent to the model - # Where Each Title is grouped together with the data under it - - Chunks = [[]] - final_chunks = list(list()) - - for i, isd_el in enumerate(isd): - if i in title_indexes: - Chunks.append([]) - Chunks[-1].append(isd_el['text']) - - # Removing all the chunks with sum of length of all the strings in the chunk < 25 - # TODO: This value can be an user defined variable - for chunk in Chunks: - # sum of length of all the strings in the chunk - sum = 0 - sum += len(str(chunk)) - if sum < 25: - Chunks.remove(chunk) - else: - # appending all the approved chunks to final_chunks as a single string - final_chunks.append(" ".join([str(item) for item in chunk])) - return final_chunks + loader = BSHTMLLoader(file) + data = loader.load() + return data diff --git a/application/requirements.txt b/application/requirements.txt index f261fca83..cd7e21c2c 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -1,5 +1,6 @@ anthropic==0.12.0 boto3==1.34.153 +beautifulsoup4==4.12.3 celery==5.3.6 dataclasses_json==0.6.7 docx2txt==0.8 @@ -30,5 +31,4 @@ tiktoken torch tqdm==4.66.3 transformers==4.43.4 -unstructured==0.15.1 Werkzeug==3.0.3 From 16aedd61da9c1361f2859a3970f720373196f6f2 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 12 Aug 2024 16:37:03 +0100 Subject: [PATCH 3/3] fix: ruff lint --- application/parser/file/html_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py index 9db23d984..4d69c8709 100644 --- a/application/parser/file/html_parser.py +++ b/application/parser/file/html_parser.py @@ -3,7 +3,6 @@ Contains parser for html files. """ -import re from pathlib import Path from typing import Dict, Union