Skip to content

Commit

Permalink
Fix bug when replacing large code block (#93)
Browse files Browse the repository at this point in the history
  • Loading branch information
bsquizz authored Dec 11, 2024
1 parent e0482e2 commit 8ee6e9a
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 8 deletions.
19 changes: 12 additions & 7 deletions connectors/db/file.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import logging
import re
import os
import re
import string
from io import StringIO, BytesIO
from io import BytesIO, StringIO
from typing import Optional

import html2text
import mdformat
import PyPDF2
import pytablereader as ptr
from bs4 import BeautifulSoup
from tabledata import TableData

from docling.document_converter import DocumentConverter, DocumentStream
from tabledata import TableData

log = logging.getLogger("tangerine.file")

Expand Down Expand Up @@ -62,15 +61,21 @@ def _remove_large_md_code_blocks(text):
code_lines = []
in_code_block = False
for line in text.split("\n"):
if line.strip() == "```" and not in_code_block:
if line.lstrip().startswith("```") and not in_code_block:
in_code_block = True
code_lines = []
code_lines.append(line)
elif line.strip() == "```" and in_code_block:
elif line.lstrip().startswith("```") and in_code_block:
code_lines.append(line)
in_code_block = False
if len(code_lines) > 9:
code_lines = ["```", "<large code block, visit documentation to view>", "```"]
# remove this block because it is too long, but preserve indentation of the block
whitespace = " " * (len(line) - len(line.lstrip()))
code_lines = [
line,
f"{whitespace}<large code block, visit documentation to view>",
line,
]
lines.extend(code_lines)
elif in_code_block:
code_lines.append(line)
Expand Down
46 changes: 45 additions & 1 deletion tests/test_md_processing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from connectors.db.file import _convert_relative_links

from connectors.db.file import _convert_relative_links, _remove_large_md_code_blocks


@pytest.mark.parametrize(
Expand Down Expand Up @@ -39,3 +40,46 @@ def test_link_conversion_bad_links():

# text should be unmodified
assert _convert_relative_links(test_txt, base_url) == test_txt


def test_remove_large_code_blocks():
    """Check that only the long fenced code block is swapped for a placeholder.

    The input markdown holds one short fenced block and one long fenced
    block. The short block must come back untouched, while the long block
    must be reduced to its fences around a single placeholder line. All text
    outside the code blocks must be left as-is.
    """
    # Input: a one-line block followed by an eight-line block, both fenced.
    source_md = """
This is a sample md file
```
short code block
```
Text in between code blocks
```
long code block
with
length
greater
than
nine
lines
long
```
Text after code blocks
"""

    # Expected output: the short block is kept, the long block is collapsed.
    collapsed_md = """
This is a sample md file
```
short code block
```
Text in between code blocks
```
<large code block, visit documentation to view>
```
Text after code blocks
"""

    actual_md = _remove_large_md_code_blocks(source_md)
    assert actual_md == collapsed_md

0 comments on commit 8ee6e9a

Please sign in to comment.