Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alert Sentry when no attachments are found for approved minutes #34

Merged
merged 2 commits into from
Jan 7, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 54 additions & 39 deletions lametro/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,17 @@ def __init__(self, event, legistar_api_url):
super().__init__(message)


class MissingAttachmentsException(Exception):
    """Signal that an approved-minutes matter has no attachments.

    The message names the matter, links to the attachment listing that came
    back empty, and tells the reader to confirm with Metro whether an
    attachment is expected.

    Attributes:
        matter_id: Identifier of the matter that has no attachments.
        attachment_url: URL of the attachment listing that was queried.
    """

    def __init__(self, matter_id, attachment_url):
        """Build the message from the matter ID and attachment listing URL.

        Args:
            matter_id: Identifier of the matter; interpolated into the message.
            attachment_url: URL of the attachment listing; shown in angle
                brackets in the message.
        """
        # Keep the raw values on the instance so handlers can inspect them
        # without having to parse the formatted message.
        self.matter_id = matter_id
        self.attachment_url = attachment_url

        message = (
            f"No attachments for the approved minutes matter with an ID of {matter_id}. "
            f"View the list of available attachments at <{attachment_url}>. "
            "Contact Metro, and ask them to confirm whether this should have an attachment."
        )

        super().__init__(message)


class LametroEventScraper(LegistarAPIEventScraper, Scraper):
BASE_URL = "https://webapi.legistar.com/v1/metro"
WEB_URL = "https://metro.legistar.com/"
Expand Down Expand Up @@ -541,45 +552,49 @@ def find_approved_minutes(self, event):

attachments = self.get(attachment_url).json()

if len(attachments) == 0:
raise ValueError("No attachments for the approved minutes matter")
elif len(attachments) == 1:
yield attachments[0]
n_minutes += 1
else:
"""
Multiple attachments have been found.
Return only those that look like minutes files.
"""
for attach in attachments:
url = attach["MatterAttachmentHyperlink"]
response = requests.get(url)

with io.BytesIO(response.content) as filestream:
try:
pdf = pdfplumber.open(filestream)
except PDFSyntaxError as e:
capture_message(
f"PDFPlumber encountered an error opening a file: {e}",
"warning"
)
continue
cover_page = pdf.pages[0]

cover_page_text = cover_page.extract_text()
if not cover_page_text:
# No extractable text found.
# Turn the page into an image and use OCR to get text.
pdf_image = cover_page.to_image(resolution=150)

with io.BytesIO() as in_mem_image:
pdf_image.save(in_mem_image)
in_mem_image.seek(0)
cover_page_text = pytesseract.image_to_string(Image.open(in_mem_image))

if "MINUTES" in cover_page_text.upper():
yield attach
n_minutes += 1
try:
if len(attachments) == 0:
raise MissingAttachmentsException(matter["MatterId"], attachment_url)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiple indentation levels can make code more difficult to read. What do you think about doing something like:

for matter in result:
...
    try:
        if len(attachments) == 0:
            raise MissingAttachmentsException(matter["MatterId"], attachment_url)
    except MissingAttachmentsException as e:
        capture_exception(e)
        continue

    if len(attachments) == 1:
    ...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see what you mean. You're totally right!

elif len(attachments) == 1:
yield attachments[0]
n_minutes += 1
else:
"""
Multiple attachments have been found.
Return only those that look like minutes files.
"""
for attach in attachments:
url = attach["MatterAttachmentHyperlink"]
response = requests.get(url)

with io.BytesIO(response.content) as filestream:
try:
pdf = pdfplumber.open(filestream)
except PDFSyntaxError as e:
capture_message(
f"PDFPlumber encountered an error opening a file: {e}",
"warning"
)
continue
cover_page = pdf.pages[0]

cover_page_text = cover_page.extract_text()
if not cover_page_text:
# No extractable text found.
# Turn the page into an image and use OCR to get text.
pdf_image = cover_page.to_image(resolution=150)

with io.BytesIO() as in_mem_image:
pdf_image.save(in_mem_image)
in_mem_image.seek(0)
cover_page_text = pytesseract.image_to_string(Image.open(in_mem_image))

if "MINUTES" in cover_page_text.upper():
yield attach
n_minutes += 1
except MissingAttachmentsException as e:
capture_exception(e)
continue

if n_minutes == 0:
self.warning(
Expand Down