diff --git a/connectors/db/file.py b/connectors/db/file.py index 2752067..20b4a95 100644 --- a/connectors/db/file.py +++ b/connectors/db/file.py @@ -1,8 +1,8 @@ import logging -import re import os +import re import string -from io import StringIO, BytesIO +from io import BytesIO, StringIO from typing import Optional import html2text @@ -10,9 +10,8 @@ import PyPDF2 import pytablereader as ptr from bs4 import BeautifulSoup -from tabledata import TableData - from docling.document_converter import DocumentConverter, DocumentStream +from tabledata import TableData log = logging.getLogger("tangerine.file") @@ -62,15 +61,21 @@ def _remove_large_md_code_blocks(text): code_lines = [] in_code_block = False for line in text.split("\n"): - if line.strip() == "```" and not in_code_block: + if line.lstrip().startswith("```") and not in_code_block: in_code_block = True code_lines = [] code_lines.append(line) - elif line.strip() == "```" and in_code_block: + elif line.lstrip().startswith("```") and in_code_block: code_lines.append(line) in_code_block = False if len(code_lines) > 9: - code_lines = ["```", "", "```"] + # remove this block because it is too long, but preserve indentation of the block + whitespace = " " * (len(line) - len(line.lstrip())) + code_lines = [ + line, + f"{whitespace}", + line, + ] lines.extend(code_lines) elif in_code_block: code_lines.append(line) diff --git a/tests/test_md_processing.py b/tests/test_md_processing.py index 44ab043..1d85a3e 100644 --- a/tests/test_md_processing.py +++ b/tests/test_md_processing.py @@ -1,5 +1,6 @@ import pytest -from connectors.db.file import _convert_relative_links + +from connectors.db.file import _convert_relative_links, _remove_large_md_code_blocks @pytest.mark.parametrize( @@ -39,3 +40,46 @@ def test_link_conversion_bad_links(): # text should be unmodified assert _convert_relative_links(test_txt, base_url) == test_txt + + +def test_remove_large_code_blocks(): + test_txt = """ + This is a sample md file + + ``` + short code block + ``` + + Text in between code blocks + + ``` + long code block + with + length + greater + than + nine + lines + long + ``` + + Text after code blocks + """ + + expected_txt = """ + This is a sample md file + + ``` + short code block + ``` + + Text in between code blocks + + ``` + + ``` + + Text after code blocks + """ + + assert _remove_large_md_code_blocks(test_txt) == expected_txt