From 6ed58add829403955749d5287f6c7207a820b87f Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 28 Nov 2023 16:05:29 -0600 Subject: [PATCH] Add token to event search requests --- lametro/events.py | 438 ++++++++++++++++++++++++++++------------------ 1 file changed, 265 insertions(+), 173 deletions(-) diff --git a/lametro/events.py b/lametro/events.py index 5e436da..49af8c7 100644 --- a/lametro/events.py +++ b/lametro/events.py @@ -5,20 +5,38 @@ from pupa.scrape import Event, Scraper from legistar.base import LegistarScraper +import requests + +try: + from .secrets import TOKEN +except: + TOKEN = None + LOGGER = logging.getLogger(__name__) + class UnmatchedEventError(Exception): def __init__(self, events): message_format = "Can't find companion for Event {0} at {1} on {2} - {3} {4}" if type(events) is dict: - message = message_format.format(events['EventId'], events['EventTime'], \ - events['EventDate'], EventInSiteURL['EventInSiteURL'], '') + message = message_format.format( + events["EventId"], + events["EventTime"], + events["EventDate"], + EventInSiteURL["EventInSiteURL"], + "", + ) elif type(events) is list: - message = '' + message = "" for event in events: - temp = message_format.format(event['EventId'], event['EventTime'], \ - event['EventDate'], event['EventInSiteURL'], '\n') + temp = message_format.format( + event["EventId"], + event["EventTime"], + event["EventDate"], + event["EventInSiteURL"], + "\n", + ) message += temp else: message = "Can't find companion event" @@ -27,8 +45,8 @@ def __init__(self, events): class LametroEventScraper(LegistarAPIEventScraper, Scraper): - BASE_URL = 'http://webapi.legistar.com/v1/metro' - WEB_URL = 'https://metro.legistar.com/' + BASE_URL = "https://webapi.legistar.com/v1/metro" + WEB_URL = "https://metro.legistar.com/" EVENTSPAGE = "https://metro.legistar.com/Calendar.aspx" TIMEZONE = "America/Los_Angeles" @@ -50,16 +68,15 @@ def _pair_events(self, events): return paired_events, unpaired_events.values() def _find_partner(self, event): - ''' + """ Attempt to find other-language partner of an event. Sometimes English events won't have Spanish partners, but every Spanish event should have an English partner. - ''' - results = list(self.search('/events/', 'EventId', - event.partner_search_string)) + """ + results = list(self.search("/events/", "EventId", event.partner_search_string)) if results: - partner, = results + (partner,) = results partner = LAMetroAPIEvent(partner) assert event.is_partner(partner) return partner @@ -70,8 +87,24 @@ def _find_partner(self, event): return None + def search(self, route, item_key, search_conditions): + search_url = self.BASE_URL + route + + params = {"$filter": search_conditions} + + # Add token to requests to the events endpoint, so we can capture SAP events. + if route == "/events/" and TOKEN: + params["token"] = TOKEN + + try: + yield from self.pages(search_url, params=params, item_key=item_key) + except requests.HTTPError as e: + if e.response.status_code == 400: + raise ValueError(e.response.json()["Message"]) + raise + def api_events(self, *args, **kwargs): - ''' + """ For meetings, Metro provides an English audio recording and sometimes a Spanish audio translation. Due to limitations with the InSite system, multiple audio recordings can't be @@ -87,11 +120,12 @@ def api_events(self, *args, **kwargs): This method subclasses the normal api_event method to ensure that we get both members of pairs. - ''' - partial_scrape = kwargs.get('since_datetime', False) + """ + partial_scrape = kwargs.get("since_datetime", False) - events = (LAMetroAPIEvent(event) for event - in super().api_events(*args, **kwargs)) + events = ( + LAMetroAPIEvent(event) for event in super().api_events(*args, **kwargs) + ) paired, unpaired = self._pair_events(events) @@ -114,7 +148,9 @@ def api_events(self, *args, **kwargs): partner_event = self._find_partner(unpaired_event) spanish_start_date = datetime.datetime(2018, 5, 15, 0, 0, 0, 0) - event_date = datetime.datetime.strptime(unpaired_event['EventDate'], '%Y-%m-%dT%H:%M:%S') + event_date = datetime.datetime.strptime( + unpaired_event["EventDate"], "%Y-%m-%dT%H:%M:%S" + ) if partner_event is not None: yield partner_event @@ -140,7 +176,11 @@ def _merge_events(self, events): # back event time as a match constraint. See: # https://github.com/opencivicdata/scrapers-us-municipal/pull/284 & # https://github.com/opencivicdata/scrapers-us-municipal/pull/309. - raise ValueError('{0} already exists as a key with a value of {1}'.format(event.key, spanish_events[event.key])) + raise ValueError( + "{0} already exists as a key with a value of {1}".format( + event.key, spanish_events[event.key] + ) + ) spanish_events[event.key] = (event, web_event) else: english_events.append((event, web_event)) @@ -149,39 +189,47 @@ def _merge_events(self, events): event_details = [] event_audio = [] - event_details.append({ - 'url': web_event['Meeting Details']['url'], - 'note': 'web', - }) + event_details.append( + { + "url": web_event["Meeting Details"]["url"], + "note": "web", + } + ) if web_event.has_audio: - event_audio.append(web_event['Meeting video']) + event_audio.append(web_event["Meeting video"]) matches = spanish_events.pop(event.partner_key, None) if matches: spanish_event, spanish_web_event = matches - event['SAPEventId'] = spanish_event['EventId'] - event['SAPEventGuid'] = spanish_event['EventGuid'] + event["SAPEventId"] = spanish_event["EventId"] + event["SAPEventGuid"] = spanish_event["EventGuid"] - event_details.append({ - 'url': spanish_web_event['Meeting Details']['url'], - 'note': 'web (sap)', - }) + event_details.append( + { + "url": spanish_web_event["Meeting Details"]["url"], + "note": "web (sap)", + } + ) if spanish_web_event.has_audio: - spanish_web_event['Meeting video']['label'] = 'Audio (SAP)' - event_audio.append(spanish_web_event['Meeting video']) + spanish_web_event["Meeting video"]["label"] = "Audio (SAP)" + event_audio.append(spanish_web_event["Meeting video"]) - event['event_details'] = event_details - event['audio'] = event_audio + event["event_details"] = event_details + event["audio"] = event_audio try: - assert not spanish_events # These should all be merged with an English event. + assert ( + not spanish_events + ) # These should all be merged with an English event. except AssertionError: unpaired_events = [event for event, _ in spanish_events.values()] - LOGGER.critical(f"Found {len(unpaired_events)} Spanish event(s) without partners.") + LOGGER.critical( + f"Found {len(unpaired_events)} Spanish event(s) without partners." + ) return english_events @@ -194,19 +242,24 @@ def scrape(self, window=None): events = self.events(since_datetime=n_days_ago) service_councils = set( - sc['BodyId'] for sc in self.search('/bodies/', 'BodyId', 'BodyTypeId eq 70 or BodyTypeId eq 75') + sc["BodyId"] + for sc in self.search( + "/bodies/", "BodyId", "BodyTypeId eq 70 or BodyTypeId eq 75" + ) ) for event, web_event in self._merge_events(events): body_name = event["EventBodyName"] - if 'Board of Directors -' in body_name: - body_name, event_name = [part.strip() - for part - in body_name.split('-')] - elif event['EventBodyId'] in service_councils: + if "Board of Directors -" in body_name: + body_name, event_name = [part.strip() for part in body_name.split("-")] + elif event["EventBodyId"] in service_councils: # Don't scrape service council or service council public hearing events. - self.info('Skipping event {0} for {1}'.format(event['EventId'], event['EventBodyName'])) + self.info( + "Skipping event {0} for {1}".format( + event["EventId"], event["EventBodyName"] + ) + ) continue else: event_name = body_name @@ -214,15 +267,15 @@ def scrape(self, window=None): # Events can have an EventAgendaStatusName of "Final", "Final Revised", # and "Final 2nd Revised." # We classify these events as "passed." - status_name = event['EventAgendaStatusName'] - if status_name.startswith('Final'): - status = 'passed' - elif status_name == 'Draft': - status = 'confirmed' - elif status_name == 'Canceled': - status = 'cancelled' + status_name = event["EventAgendaStatusName"] + if status_name.startswith("Final"): + status = "passed" + elif status_name == "Draft": + status = "confirmed" + elif status_name == "Canceled": + status = "cancelled" else: - status = 'tentative' + status = "tentative" location = event["EventLocation"] @@ -230,33 +283,38 @@ def scrape(self, window=None): # We expect some events to have no location. LA Metro would # like these displayed in the Councilmatic interface. However, # OCD requires a value for this field. Add a sane default. - location = 'Not available' + location = "Not available" - e = Event(event_name, - start_date=event["start"], - description='', - location_name=location, - status=status) + e = Event( + event_name, + start_date=event["start"], + description="", + location_name=location, + status=status, + ) - e.pupa_id = str(event['EventId']) + e.pupa_id = str(event["EventId"]) # Metro requires the EventGuid to build out MediaPlayer links. # Add both the English event GUID, and the Spanish event GUID if # it exists, to the extras dict. - e.extras = {'guid': event['EventGuid']} + e.extras = {"guid": event["EventGuid"]} - legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId']) - e.add_source(legistar_api_url, note='api') + legistar_api_url = self.BASE_URL + "/events/{0}".format(event["EventId"]) + e.add_source(legistar_api_url, note="api") - if event.get('SAPEventGuid'): - e.extras['sap_guid'] = event['SAPEventGuid'] + if event.get("SAPEventGuid"): + e.extras["sap_guid"] = event["SAPEventGuid"] if web_event.has_ecomment: - self.info('Adding eComment link {0} from {1}'.format(web_event['eComment'], - web_event['Meeting Details']['url'])) - e.extras['ecomment'] = web_event['eComment'] + self.info( + "Adding eComment link {0} from {1}".format( + web_event["eComment"], web_event["Meeting Details"]["url"] + ) + ) + e.extras["ecomment"] = web_event["eComment"] - if 'event_details' in event: + if "event_details" in event: # if there is not a meeting detail page on legistar # don't capture the agenda data from the API for item in self.agenda(event): @@ -269,64 +327,87 @@ def scrape(self, window=None): # To the notes field, add the item number as given in the agenda minutes agenda_number = item["EventItemAgendaNumber"] note = "Agenda number, {}".format(agenda_number) - agenda_item['notes'].append(note) + agenda_item["notes"].append(note) - agenda_item['extras']['agenda_number'] = agenda_number + agenda_item["extras"]["agenda_number"] = agenda_number # The EventItemAgendaSequence provides # the line number of the Legistar agenda grid. - agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence'] + agenda_item["extras"]["item_agenda_sequence"] = item[ + "EventItemAgendaSequence" + ] # Historically, the Legistar system has duplicated the EventItemAgendaSequence, # resulting in data inaccuracies. The scrape should fail in such cases, until Metro # cleans the data. - item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda] + item_agenda_sequences = [ + item["extras"]["item_agenda_sequence"] for item in e.agenda + ] if len(item_agenda_sequences) != len(set(item_agenda_sequences)): - error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \ + error_msg = "An agenda has duplicate agenda items on the Legistar grid: \ {event_name} on {event_date} ({legistar_api_url}). \ - Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.' - - raise ValueError(error_msg.format(event_name=e.name, - event_date=e.start_date.strftime("%B %d, %Y"), - legistar_api_url=legistar_api_url)) + Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence." + + raise ValueError( + error_msg.format( + event_name=e.name, + event_date=e.start_date.strftime("%B %d, %Y"), + legistar_api_url=legistar_api_url, + ) + ) - e.add_participant(name=body_name, - type="organization") + e.add_participant(name=body_name, type="organization") - if event.get('SAPEventId'): - e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']), - note='api (sap)') + if event.get("SAPEventId"): + e.add_source( + self.BASE_URL + "/events/{0}".format(event["SAPEventId"]), + note="api (sap)", + ) - if event['EventAgendaFile']: - e.add_document(note='Agenda', - url=event['EventAgendaFile'], - media_type="application/pdf", - date=self.to_utc_timestamp(event['EventAgendaLastPublishedUTC']).date()) + if event["EventAgendaFile"]: + e.add_document( + note="Agenda", + url=event["EventAgendaFile"], + media_type="application/pdf", + date=self.to_utc_timestamp( + event["EventAgendaLastPublishedUTC"] + ).date(), + ) # in case this event's minutes haven't been approved yet - e.extras['approved_minutes'] = False - - if event['EventMinutesFile']: - e.add_document(note='Minutes', - url=event['EventMinutesFile'], - media_type="application/pdf", - date=self.to_utc_timestamp(event['EventMinutesLastPublishedUTC']).date()) - elif web_event['Published minutes'] != 'Not\xa0available': - e.add_document(note=web_event['Published minutes']['label'], - url=web_event['Published minutes']['url'], - media_type="application/pdf") + e.extras["approved_minutes"] = False + + if event["EventMinutesFile"]: + e.add_document( + note="Minutes", + url=event["EventMinutesFile"], + media_type="application/pdf", + date=self.to_utc_timestamp( + event["EventMinutesLastPublishedUTC"] + ).date(), + ) + elif web_event["Published minutes"] != "Not\xa0available": + e.add_document( + note=web_event["Published minutes"]["label"], + url=web_event["Published minutes"]["url"], + media_type="application/pdf", + ) else: approved_minutes = self.find_approved_minutes(event) if approved_minutes: - e.add_document(note=approved_minutes['MatterAttachmentName'], - url=approved_minutes['MatterAttachmentHyperlink'], - media_type="application/pdf", - date=self.to_utc_timestamp(approved_minutes['MatterAttachmentLastModifiedUtc']).date()) - e.extras['approved_minutes'] = True + e.add_document( + note=approved_minutes["MatterAttachmentName"], + url=approved_minutes["MatterAttachmentHyperlink"], + media_type="application/pdf", + date=self.to_utc_timestamp( + approved_minutes["MatterAttachmentLastModifiedUtc"] + ).date(), + ) + e.extras["approved_minutes"] = True - for audio in event['audio']: + for audio in event["audio"]: try: - redirect_url = self.head(audio['url']).headers['Location'] + redirect_url = self.head(audio["url"]).headers["Location"] except KeyError: # In some cases, the redirect URL does not yet @@ -345,21 +426,23 @@ def scrape(self, window=None): # Because of the way that the event['audio'] is created # the first audio link is always English and the # second is always Spanish - e.add_media_link(note=audio['label'], - url=redirect_url, - media_type='text/html', - on_duplicate='ignore') - - if event['event_details']: - for link in event['event_details']: + e.add_media_link( + note=audio["label"], + url=redirect_url, + media_type="text/html", + on_duplicate="ignore", + ) + + if event["event_details"]: + for link in event["event_details"]: e.add_source(**link) else: - e.add_source('https://metro.legistar.com/Calendar.aspx', note='web') + e.add_source("https://metro.legistar.com/Calendar.aspx", note="web") yield e def _suppress_item_matter(self, item, agenda_url): - ''' + """ Agenda items in Legistar do not always display links to associated matter files even if the same agenda item in the API references a Matter File. The agenda items @@ -369,22 +452,21 @@ def _suppress_item_matter(self, item, agenda_url): This is also practical because matter files that are hidden in the Legistar Agenda do not seem to available for scraping on Legistar or through the API - ''' - - if item['EventItemMatterFile'] is not None: + """ - if item['EventItemMatterStatus'] == 'Draft': + if item["EventItemMatterFile"] is not None: + if item["EventItemMatterStatus"] == "Draft": suppress = True - elif item['EventItemMatterType'] == 'Closed Session': + elif item["EventItemMatterType"] == "Closed Session": suppress = True else: suppress = False if suppress: - item['EventItemMatterFile'] = None + item["EventItemMatterFile"] = None def find_approved_minutes(self, event): - ''' + """ The minutes of some meetings are available as a legislative item that are approved at the subsequent meeting. This method tries to find them. @@ -393,46 +475,52 @@ def find_approved_minutes(self, event): here, it avoids many complicated and expensive queries in the councilmatic app. - ''' - name = event['EventBodyName'] + """ + name = event["EventBodyName"] - if name not in {'Board of Directors - Regular Board Meeting', - 'LA SAFE'}: + if name not in {"Board of Directors - Regular Board Meeting", "LA SAFE"}: return None # if the event is the future, there won't have been a chance to # approve the minutes - if event['start'] > datetime.datetime.now(datetime.timezone.utc): + if event["start"] > datetime.datetime.now(datetime.timezone.utc): return None - date = event['start'].strftime('%B %-d, %Y') + date = event["start"].strftime("%B %-d, %Y") result = self.search( - '/matters/', - 'MatterId', - "MatterBodyId eq {} and substringof('{}', MatterTitle) and substringof('Minutes', MatterTitle)".format(event['EventBodyId'], date)) + "/matters/", + "MatterId", + "MatterBodyId eq {} and substringof('{}', MatterTitle) and substringof('Minutes', MatterTitle)".format( + event["EventBodyId"], date + ), + ) try: - matter, = result + (matter,) = result except ValueError as e: - if 'not enough values' in str(e): + if "not enough values" in str(e): self.warning( - "Couldn't find minutes for the {} meeting of {}."\ - .format(name, date)) + "Couldn't find minutes for the {} meeting of {}.".format(name, date) + ) return None - elif 'too many values to unpack' in str(e): + elif "too many values to unpack" in str(e): self.warning( - "Found more than one minutes file for the {} meeting of {}."\ - .format(name, date)) + "Found more than one minutes file for the {} meeting of {}.".format( + name, date + ) + ) return None else: raise - attachment_url = self.BASE_URL + '/matters/{}/attachments'.format(matter['MatterId']) + attachment_url = self.BASE_URL + "/matters/{}/attachments".format( + matter["MatterId"] + ) attachments = self.get(attachment_url).json() if len(attachments) == 0: - raise ValueError('No attachments for the approved minutes matter') + raise ValueError("No attachments for the approved minutes matter") elif len(attachments) == 1: return attachments[0] else: @@ -441,94 +529,98 @@ def find_approved_minutes(self, event): # to the name of the attachment representing the correct minutes # file. handled_cases = { - 'May 28, 2015': 'Regular Board Meeting Minutes on May 28, 2015', - 'September 24, 2020': 'LA SAFE Minutes - September 24, 2020', - 'June 24, 2021': 'LA SAFE MINUTES - June 24, 2021', - 'December 2, 2021': 'Regular Board Meeting MINUTES - December 2, 2021', - 'January 27, 2022': 'Regular Board Meeting MINUTES - January 27, 2022', - 'February 24, 2022': 'MINUTES - February 24, 2022 RBM', - 'June 23, 2022': 'Regular Board Meeting MINUTES - June 23, 2022', - 'December 1, 2022': 'Regular Board Meeting MINUTES - December 1, 2022' + "May 28, 2015": "Regular Board Meeting Minutes on May 28, 2015", + "September 24, 2020": "LA SAFE Minutes - September 24, 2020", + "June 24, 2021": "LA SAFE MINUTES - June 24, 2021", + "December 2, 2021": "Regular Board Meeting MINUTES - December 2, 2021", + "January 27, 2022": "Regular Board Meeting MINUTES - January 27, 2022", + "February 24, 2022": "MINUTES - February 24, 2022 RBM", + "June 23, 2022": "Regular Board Meeting MINUTES - June 23, 2022", + "December 1, 2022": "Regular Board Meeting MINUTES - December 1, 2022", } if date in handled_cases: attachment_name = handled_cases[date] - attachment, = [ - each for each in attachments - if each['MatterAttachmentName'] == attachment_name + (attachment,) = [ + each + for each in attachments + if each["MatterAttachmentName"] == attachment_name ] return attachment else: try: - attachment, = [ - each for each in attachments - if 'minutes' in each['MatterAttachmentName'].lower() + (attachment,) = [ + each + for each in attachments + if "minutes" in each["MatterAttachmentName"].lower() ] except ValueError: LOGGER.critical( "More than one attachment for the approved minutes matter" ) else: - msg = 'More than attachment for minutes matter {0}, using {1}'.format( - matter['MatterId'], attachment['MatterAttachmentName'] + msg = ( + "More than attachment for minutes matter {0}, using {1}".format( + matter["MatterId"], attachment["MatterAttachmentName"] + ) ) self.info(msg) return attachment - class LAMetroAPIEvent(dict): - ''' + """ This class is for adding methods to the API event dict to faciliate maching events with their other-language partners. - ''' + """ + @property def is_spanish(self): - return self['EventBodyName'].endswith(' (SAP)') + return self["EventBodyName"].endswith(" (SAP)") @property def _partner_name(self): if self.is_spanish: - return self['EventBodyName'].rstrip(' (SAP)') + return self["EventBodyName"].rstrip(" (SAP)") else: - return self['EventBodyName'] + ' (SAP)' + return self["EventBodyName"] + " (SAP)" def is_partner(self, other): - return (self._partner_name == other['EventBodyName'] and - self['EventDate'] == other['EventDate']) + return ( + self._partner_name == other["EventBodyName"] + and self["EventDate"] == other["EventDate"] + ) @property def partner_search_string(self): search_string = "EventBodyName eq '{}'".format(self._partner_name) - search_string += " and EventDate eq datetime'{}'".format(self['EventDate']) + search_string += " and EventDate eq datetime'{}'".format(self["EventDate"]) return search_string @property def partner_key(self): - return (self._partner_name, self['EventDate']) + return (self._partner_name, self["EventDate"]) @property def key(self): - return (self['EventBodyName'], self['EventDate']) + return (self["EventBodyName"], self["EventDate"]) class LAMetroWebEvent(dict): - ''' + """ This class is for adding methods to the web event dict to facilitate labeling and sourcing audio appropriately. - ''' + """ - web_scraper = LegistarScraper(retry_attempts=3, - requests_per_minute=0) + web_scraper = LegistarScraper(retry_attempts=3, requests_per_minute=0) @property def has_audio(self): - return self['Meeting video'] != 'Not\xa0available' + return self["Meeting video"] != "Not\xa0available" @property def has_ecomment(self): - return self['eComment'] != 'Not\xa0available' - + return self["eComment"] != "Not\xa0available"