Skip to content

Commit

Permalink
patch: remove unidecode as it was transliterating non-latin chars (#434)
Browse files (browse the repository at this point in the history)
resolves #298
  • Loading branch information
timothycarambat authored Dec 13, 2023
1 parent b444171 commit da0cec7
Showing 1 changed file with 1 addition and 2 deletions.
3 changes: 1 addition & 2 deletions collector/scripts/watch/convert/as_pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,6 @@
from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize
from unidecode import unidecode

# Process all PDF-related documents.
def as_pdf(**kwargs):
Expand All @@ -29,7 +28,7 @@ def as_pdf(**kwargs):
page_content = ''
for page in fitz.open(fullpath):
print(f"-- Parsing content from pg {page.number} --")
page_content += unidecode(page.get_text('text'))
page_content += str(page.get_text('text'))

if len(page_content) == 0:
print(f"Resulting page content was empty - no text could be extracted from the document.")
Expand Down

0 comments on commit da0cec7

Please sign in to comment.