Skip to content

Commit

Permalink
Replace PyPDF2 with pypdf because PyPDF2 is no longer maintained
Browse files Browse the repository at this point in the history
Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal committed Sep 25, 2024
1 parent bff2796 commit 2e224b6
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 6 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ openai>=1.13.3,<2.0.0
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
pypdf>=5.0.0
tabulate>=0.9.0
tenacity>=8.3.0,!=8.4.0
transformers>=4.44.2
Expand Down
10 changes: 4 additions & 6 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
)
import git
import gitdb
import PyPDF2
from pypdf import PdfReader
import yaml

# Local
Expand Down Expand Up @@ -103,7 +103,6 @@ def _get_taxonomy(repo="taxonomy"):
taxonomy_file_paths.append(str(file_path))
return taxonomy_file_paths


def _get_documents(
source: Dict[str, Union[str, List[str]]],
skip_checkout: bool = False,
Expand Down Expand Up @@ -145,10 +144,10 @@ def _get_documents(
elif file_path.endswith(".pdf"):
# Process PDF files
with open(file_path, "rb") as file:
reader = PyPDF2.PdfReader(file)
reader = PdfReader(file) # Use pypdf PdfReader
pdf_text = ""
for page in range(len(reader.pages)):
pdf_text += reader.pages[page].extract_text()
for page in reader.pages: # Iterating through pages
pdf_text += page.extract_text()
file_contents.append(pdf_text)

if file_contents:
Expand All @@ -158,7 +157,6 @@ def _get_documents(
except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
logger.error("Error retrieving documents: %s", str(e))
raise e


# pylint: disable=broad-exception-caught
def _read_taxonomy_file(file_path: str | Path, yamllint_config: str | None = None):
Expand Down

0 comments on commit 2e224b6

Please sign in to comment.