Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kirkland scraper #445

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
49fd814
Fix Kirkland scraper
rafe-murray May 30, 2024
6f52701
Update Montreal Est scraper
rafe-murray Jun 5, 2024
707e503
Merge branch 'master' into montreal_est_scraper
samJMA Oct 29, 2024
ca587f3
Fixed Errors
samJMA Oct 29, 2024
d35d876
Merge branch 'master' into kirkland_scraper
samJMA Oct 29, 2024
1bd0cde
Similar solution to burnaby web scraper.
samJMA Oct 29, 2024
0b49b23
Updated formatting
samJMA Nov 6, 2024
f8d2789
Updated formatting
samJMA Nov 6, 2024
f8a69cf
Merge branch 'opencivicdata:master' into kirkland_scraper
iepmas Nov 6, 2024
29c7f83
Merge branch 'opencivicdata:master' into montreal_est_scraper
iepmas Nov 6, 2024
3083d46
ca_qc_montreal_est: Make changes closer to original
jpmckinney Nov 7, 2024
6e7c15e
Fixed email encryption error
samJMA Nov 8, 2024
a4770b7
ca_qc_sainte_anne_de_bellevue: Squash #447 after simplifying changes
jpmckinney Nov 11, 2024
3fb90a2
build: Upgrade opencivicdata
jpmckinney Nov 11, 2024
fef5d3f
ca_on_wilmot: Skip executive officer
jpmckinney Nov 11, 2024
1e96ba1
ca_ns_halifax: Allow Jean St.Amand
jpmckinney Nov 11, 2024
d6249fc
ca_on_kawartha_lakes: Ignore names including content like "RESIGNED A…
jpmckinney Nov 11, 2024
e50e780
ca_on_thunder_bay: Fix SSL error
jpmckinney Nov 12, 2024
744835b
ca_yt: Set user-agent and cookie (DEFAULT_USER_AGENT and cookie from …
jpmckinney Nov 12, 2024
b5c125a
ca_on_thunder_bay: patching requests didn't work on Heroku
jpmckinney Nov 12, 2024
d68412c
ca_ns_cape_breton: Escape quotation marks in name
jpmckinney Nov 12, 2024
b48c025
ca_yt: Add comment about Cloudflare bot products
jpmckinney Nov 12, 2024
6ceb72b
Update people.py
bzhangjma Nov 18, 2024
adff484
Update people.py
bzhangjma Nov 18, 2024
4c6e6d4
Update people.py
bzhangjma Nov 18, 2024
4f8f4a7
chore: Remove unused imports
jpmckinney Nov 18, 2024
6de91ee
ca_bc: Use multiline string for readability
jpmckinney Nov 18, 2024
05054d9
ca_bc: Fix validation to allow "Hon Chan" and "A'aliya
jpmckinney Nov 18, 2024
d1f8d1e
Merge branch 'ca_bc_fix_2'
jpmckinney Nov 18, 2024
8f5acfb
Removed unused function
samJMA Nov 20, 2024
83a5a04
Fix Kirkland scraper
rafe-murray May 30, 2024
01536a3
Similar solution to burnaby web scraper.
samJMA Oct 29, 2024
42571bd
Fixed email encryption error
samJMA Nov 8, 2024
d3894c4
Removed unused function
samJMA Nov 20, 2024
ac99311
Merge branch 'kirkland_scraper' of https://github.com/JMAConsulting/s…
samJMA Nov 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 46 additions & 61 deletions ca_bc/people.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,61 @@
import re
import json
from textwrap import dedent

from utils import CanadianPerson as Person
from utils import CanadianScraper

COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'"
COUNCIL_PAGE = "https://www.leg.bc.ca/members"


class BritishColumbiaPersonScraper(CanadianScraper):
def scrape(self):
parties = {
"BC NDP": "New Democratic Party of British Columbia",
"BC Liberal Party": "British Columbia Liberal Party",
}
response = self.post(url="https://lims.leg.bc.ca/graphql", json={
"query": dedent("""\
query GetMLAsByConstituency($parliamentId: Int!) {
allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
nodes {
image: imageBySmallImageId {
path
description
__typename
}
constituency: constituencyByConstituencyId {
name
__typename
}
member: memberByMemberId {
firstName
lastName
__typename
}
isCounsel
isDoctor
isHonourable
party: partyByPartyId {
name
abbreviation
__typename
}
nodeId
__typename
}
__typename
}
}"""
),
"variables": {"parliamentId": 43},
})
data = json.loads(response.content.decode("utf-8"))
members = data["data"]["allMemberParliaments"]["nodes"]

page = self.lxmlize(COUNCIL_PAGE, xml=True)

nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"}
members = page.xpath("//d:Cells", namespaces=nsmap)
assert len(members), "No members found"
for member in members:
url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[
0
]
if "vacant" in url.lower():
continue
page = self.lxmlize(url)
image = "https://lims.leg.bc.ca/public" + member["image"]["path"]
district = member["constituency"]["name"]
name = member["member"]["firstName"] + " " + member["member"]["lastName"]
party = member["party"]["name"]

name = (
page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0]
.replace("Wm.", "")
.replace(", Q.C.", "")
.replace(", K.C.", "")
.strip()
)
district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()'))
party = parties.get(party, party)
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image)
p.add_source(COUNCIL_PAGE)
p.add_source(url)

p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]

email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip()
if "#" in email:
email = email.split("#")[0]
if email:
p.add_contact("email", email)

office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()')))
office = re.sub(r"\s{2,}", " ", office)
p.add_contact("address", office, "legislature")

constituency = ", ".join(
cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'))
)
constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0]
p.add_contact("address", constituency, "constituency")

phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()'))

office_phone = phones[0]
p.add_contact("voice", office_phone, "legislature")
if len(phones) > 1:
constituency_phone = phones[1]
p.add_contact("voice", constituency_phone, "constituency")

roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip()
if roles:
p.extras["roles"] = [roles]

yield p


def cleanup_list(dirty_list):
return list(filter(None, (x.strip() for x in dirty_list)))
3 changes: 2 additions & 1 deletion ca_ns_cape_breton/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def decode_email(script):
councillor_url = councillor.xpath(".//a/@href")[0]
p.add_source(councillor_url)
page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT)
image = page.xpath(f'//img[contains(@title, "{name}")]/@src')
escaped_name = name.replace('"', "&quot;")
image = page.xpath(f'//img[contains(@title, "{escaped_name}")]/@src')
if image:
p.image = image[0]
yield p
Expand Down
3 changes: 3 additions & 0 deletions ca_on_kawartha_lakes/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def scrape(self):
name = councillor.text_content().replace("Mayor", "").strip()
role = "Mayor"

if "RESIGNED" in name:
continue

info_node = councillor.xpath("./following-sibling::*")[0]
email = self.get_email(info_node)
phone = self.get_phone(info_node)
Expand Down
13 changes: 4 additions & 9 deletions ca_on_thunder_bay/people.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import requests

from utils import DEFAULT_USER_AGENT, CanadianScraper
from utils import CanadianScraper
from utils import CanadianPerson as Person

COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx"
Expand All @@ -9,13 +7,14 @@
class ThunderBayPersonScraper(CanadianScraper):
def scrape(self):
seat_number = 1
page = self.lxmlize(COUNCIL_PAGE)
# SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1133)'))
page = self.lxmlize(COUNCIL_PAGE, verify=False)

councillors = page.xpath("//p[@class='Center']/a[@href]")
assert len(councillors), "No councillors found"
for councillor in councillors:
url = councillor.xpath("./@href")[0]
councillor_page = self.lxmlize(url)
councillor_page = self.lxmlize(url, verify=False)
info = councillor_page.xpath("//div[@class='iCreateDynaToken']")[1]
role = info.xpath("./h2")[0].text_content()
name = info.xpath("./h3")[0].text_content()
Expand All @@ -42,7 +41,3 @@ def scrape(self):
p.image = photo

yield p

def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key
return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml)
3 changes: 3 additions & 0 deletions ca_on_wilmot/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ def scrape(self):
role_name, contact_info = councillors[i], councillors[i + 1]
role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ")

if "Executive Officer to the Mayor and Council" in role:
continue

# "Ward 1 Councillor"
if "Councillor" in role:
district = role.split(" Councillor")[0]
Expand Down
8 changes: 6 additions & 2 deletions ca_qc_kirkland/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,18 @@ def scrape(self):

name = councillor.xpath(".//strong/text()")[0]

# Using self.get_phone does not include the extension #
phone = (
councillor.xpath('.//div[contains(text(), "#")]/text()')[0]
.replace("T ", "")
.replace(" ", "-")
.replace(".", ",") # correcting a typo
.replace(".", ",")
.replace(",-#-", " x")
)
email = self.get_email(councillor)

# cloudflare encrypts the email data
encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0]
email = self._cloudflare_decode(encrypted_email)

p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
Expand Down
12 changes: 7 additions & 5 deletions ca_qc_montreal_est/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,22 @@
class MontrealEstPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]")
councillors = page.xpath(
'//div[contains (@class, "membreimg text-center membres-conseil")]//div//div[@class="col-lg-6"]'
)
assert len(councillors), "No councillors found"
for councillor in councillors:
name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1)
name, role_district = councillor.xpath('.//div[@class="bg-trans-gris"]/span/text()')[0].split(" – ", 1)

if "Maire" in role_district:
if "Maire" in role_district or "Mairesse" in role_district:
district = "Montréal-Est"
role = "Maire"
else:
district = f"District {role_district[-1]}"
role = "Conseiller"

p = Person(primary_org="legislature", name=name, district=district, role=role)
p.image = councillor.xpath(".//@data-lazy-src")[0]
p = Person(primary_org="legislature", name=name.strip(), district=district, role=role)
p.image = councillor.xpath(".//div[not(@id)]/img//@src")[0]
p.add_contact("email", self.get_email(councillor))
p.add_source(COUNCIL_PAGE)
yield p
8 changes: 4 additions & 4 deletions ca_qc_sainte_anne_de_bellevue/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)

councillors = page.xpath('//div[@class="block text"]')
councillors = page.xpath('//p[a[contains(@href, "@")]]')
assert len(councillors), "No councillors found"

for councillor in councillors:
name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0]
district = councillor.xpath(".//h2/text()")[0]
name = councillor.text_content().split(" |", 1)[0]
district = councillor.xpath("./preceding-sibling::h2[1]/text()")[0]

if "Maire" in district:
district = "Sainte-Anne-de-Bellevue"
Expand All @@ -26,6 +27,5 @@ def scrape(self):
p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)

p.image = councillor.xpath(".//@src")[0]
p.add_contact("email", self.get_email(councillor))
yield p
2 changes: 2 additions & 0 deletions ca_yt/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
COUNCIL_PAGE = "https://yukonassembly.ca/mlas"


# This website uses Cloudflare bot products (setting a __cf_bm cookie), which is hard to circumvent.
# https://developers.cloudflare.com/fundamentals/reference/policies-compliances/cloudflare-cookies/
class YukonPersonScraper(CanadianScraper):
def scrape(self):
page = self.lxmlize(COUNCIL_PAGE)
Expand Down
6 changes: 3 additions & 3 deletions patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,17 @@
r"(?:Jr|Rev|Sr|St)\.|"
r"da|de|den|der|la|van|von|"
r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|'
r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|"
r"Ch'ng|Prud'homme|"
r"A'aliya|Ch'ng|Prud'homme|"
r"D!ONNE|IsaBelle|Ya'ara"
r")"
)

# Name components can be joined by apostrophes, hyphens or spaces.
person_schema["properties"]["name"]["pattern"] = re.compile(
r"\A"
r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
r"(?:" + name_fragment + r"(?:'|-| - | )"
r")+" + name_fragment + r"\Z"
)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ lxml==4.9.1
# via -r requirements.in
olefile==0.47
# via agate-excel
opencivicdata==3.3.1
opencivicdata==3.4.0
# via
# -r requirements.in
# pupa
Expand Down
17 changes: 10 additions & 7 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,17 @@ def get_link(self, node, substring, *, error=True):
return None

def get(self, *args, **kwargs):
return super().get(*args, verify=SSL_VERIFY, **kwargs)
return super().get(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs)

def post(self, *args, **kwargs):
return super().post(*args, verify=SSL_VERIFY, **kwargs)
return super().post(*args, verify=kwargs.pop("verify", SSL_VERIFY), **kwargs)

def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False, verify=SSL_VERIFY):
# Sets User-Agent header.
# https://github.com/jamesturk/scrapelib/blob/5ce0916/scrapelib/__init__.py#L505
self.user_agent = user_agent

response = self.get(url, cookies=cookies)
response = self.get(url, cookies=cookies, verify=verify)
if encoding:
response.encoding = encoding

Expand Down Expand Up @@ -737,9 +739,10 @@ def clean_string(s):


def clean_name(s):
return honorific_suffix_re.sub(
"", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
)
name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
if name.count(" ") > 1:
return honorific_prefix_re.sub("", name) # Avoid truncating names like "Hon Chan"
return name


def clean_type_id(type_id):
Expand Down