-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1405 from JeevaRamanathan/feature/file-pptx_parser
feat: `.pptx` Presentation parser implementation
- Loading branch information
Showing
10 changed files
with
88 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -343,6 +343,7 @@ def post(self): | |
".mdx", | ||
".json", | ||
".xlsx", | ||
".pptx", | ||
], | ||
job_name, | ||
final_filename, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
"""PPT parser. | ||
Contains parsers for presentation (.pptx) files to extract slide text. | ||
""" | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Union | ||
|
||
from application.parser.file.base_parser import BaseParser | ||
|
||
class PPTXParser(BaseParser):
    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.

    Args:
        concat_slides (bool): Specifies whether to concatenate all slide text
            into one document.
            - If True, slide texts will be joined together as a single string.
            - If False, each slide's text will be stored as a separate entry
              in a list.
            Set to True by default.
        slide_separator (str): Separator used to join slides' text content.
            Only used when `concat_slides=True`. Default is "\n".

    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
    """

    def __init__(
        self,
        *args: Any,
        concat_slides: bool = True,
        slide_separator: str = "\n",
        **kwargs: Any
    ) -> None:
        """Store the slide-joining options and forward the rest to the base class."""
        super().__init__(*args, **kwargs)
        self._concat_slides = concat_slides
        self._slide_separator = slide_separator

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none."""
        return {}

    @staticmethod
    def _extract_slide_text(slide: Any) -> str:
        """Collect the text of every text-bearing shape on one slide.

        Shapes that expose no ``text`` attribute are skipped, as are shapes
        whose text is empty or whitespace only. The remaining texts are joined
        with a newline so that words from adjacent shapes (for example a title
        and a body placeholder) do not run together.
        """
        shape_texts = []
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            text = shape.text.strip()
            if text:
                shape_texts.append(text)
        return "\n".join(shape_texts)

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        r"""Parse a .pptx file and extract text from each slide.

        Args:
            file (Path): Path to the .pptx file.
            errors (str): Error handling policy ('ignore' by default).
                Currently not consulted by this parser; it is kept so the
                signature stays unchanged for existing callers.

        Returns:
            Union[str, List[str]]: Concatenated text if concat_slides is True,
                otherwise a list of slide texts (one entry per slide, in
                presentation order).

        Raises:
            ImportError: If the `pptx` module is not installed.
        """
        # Imported lazily so the dependency is only required when a .pptx
        # file is actually parsed.
        try:
            from pptx import Presentation
        except ImportError as err:
            raise ImportError(
                "pptx module is required to read .PPTX files."
            ) from err

        # Any error raised while opening or reading the file propagates to the
        # caller unchanged.
        presentation = Presentation(file)
        slide_texts = [
            self._extract_slide_text(slide) for slide in presentation.slides
        ]

        if self._concat_slides:
            return self._slide_separator.join(slide_texts)
        return slide_texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters