Skip to content

Commit

Permalink
Cache SP name resolution by id
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Apr 22, 2024
1 parent a193ed0 commit 20cc60c
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 5 deletions.
7 changes: 5 additions & 2 deletions pyscraper/sp_2024/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,15 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
# iterate through the agenda_item's subitems
# if item is a speech or a division, get the new information and bring across
previous_speech = None
missing_speakers = []
for subitem in item.find("parsed"):
if subitem.tag == "speech":
speaker_name = subitem.get("speaker_name")
person_id = get_unique_person_id(speaker_name, iso_date)
if person_id is None and verbose:
scot_parl_id = subitem.get("speaker_scot_id")
person_id = get_unique_person_id(speaker_name, iso_date, lookup_key=scot_parl_id)
if person_id is None and speaker_name not in missing_speakers and verbose:
print(f"Could not find person id for {speaker_name}")
missing_speakers.append(speaker_name)
speech = etree.Element("speech")
speech.set("id", id_factory.get_next_minor_id())
speech.set("url", subitem.get("speech_url") or "")
Expand Down
37 changes: 34 additions & 3 deletions pyscraper/sp_2024/resolvenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,15 @@ def match_whole_speaker(self, speaker_name, speaker_date):
m = re.search("^([^\(]*)(.*)", speaker_name)
first_part = m.group(1).strip()
bracketed_parts = m.group(2).strip()

ids_from_first_part = memberList.match_string_somehow(
first_part, speaker_date, party, False
)

if ids_from_first_part is None and bracketed_parts:

This comment has been minimized.

Copy link
@dracos

dracos Apr 23, 2024

Member

Does this then replace the bracketed_parts while loop further down? Would make this fn quite a bit smaller if it did, not sure what that loop is doing though.

This comment has been minimized.

Copy link
@ajparsons

ajparsons Apr 23, 2024

Author Contributor

That loop seems to narrow down the results when multiple people are returned for a name. I'm unsure under what conditions the brackets help when that happens, so this isn't doing quite the same thing.

ids_from_first_part = memberList.match_string_somehow(
bracketed_parts, speaker_date, party, False
)

if ids_from_first_part is None:
return None
else:
Expand Down Expand Up @@ -356,7 +361,32 @@ def log_speaker(speaker, date, message):
fp.write(str(date) + ": [" + message + "] " + speaker + "\n")


def get_unique_person_id(tidied_speaker: str, on_date: str):
class IDCache:
def __init__(self):
self.cache = {}

def check(self, key: str | None) -> None | str:
if key is not None:
return self.cache.get(key, None)
else:
return None

def set(self, key: str | None, value: str):
if key:
self.cache[key] = value
return value


# Module-level cache instance consulted and filled by get_unique_person_id.
id_cache = IDCache()


def get_unique_person_id(
tidied_speaker: str, on_date: str, lookup_key: str | None = None
):
# check we haven't cached this one first
if v := id_cache.check(lookup_key):
return v

ids = memberList.match_whole_speaker(tidied_speaker, str(on_date))
if ids is None:
# This special return value (None) indicates that the speaker
Expand All @@ -367,7 +397,8 @@ def get_unique_person_id(tidied_speaker: str, on_date: str):
log_speaker(tidied_speaker, str(on_date), "missing")
return None
elif len(ids) == 1:
return ids[0]
# cache for future lookup
return id_cache.set(lookup_key, ids[0])
else:
raise Exception(
f"The speaker '{tidied_speaker}' could not be resolved, found: {ids}"
Expand Down

0 comments on commit 20cc60c

Please sign in to comment.