Skip to content

Commit

Permalink
fixup! Update scripts to reference new SP scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Apr 22, 2024
1 parent f135873 commit 835292d
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 91 deletions.
132 changes: 61 additions & 71 deletions pyscraper/sp_2024/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from __future__ import annotations

from .download import fetch_debates_for_dates
from .parse import parse_downloaded, tidy_up_html
from .parse import tidy_up_html
from .convert import convert_to_twfy
import click
from pathlib import Path
Expand All @@ -25,96 +25,86 @@ def cli():
pass


def cache_dir_iterator(
    cache_dir: Path,
    start_date: datetime.date,
    end_date: datetime.date,
):
    """
    Yield xml files in cache_dir whose dates fall between start_date and
    end_date (both inclusive).

    Filenames are expected to begin with an ISO date (YYYY-MM-DD); files
    whose names do not start with a valid ISO date are skipped rather than
    aborting the whole run.
    """

    for file in cache_dir.glob("*.xml"):
        # date is an iso date at the start of the filename
        try:
            date = datetime.date.fromisoformat(file.stem[:10])
        except ValueError:
            # not a scraper-produced file (no leading ISO date) - ignore it
            continue
        if start_date <= date <= end_date:
            yield file


@cli.command()
@click.option(
    "--start-date", help="isodate to start fetching debates from", required=True
)
@click.option("--end-date", help="isodate to end fetching debates at", required=True)
@click.option(
    "--download",
    is_flag=True,
    help="Download the debates, pair with 'override' to redownload all files",
)
@click.option("--parse", is_flag=True, help="Parse the downloaded debates")
@click.option("--convert", is_flag=True, help="Convert the parsed debates")
@click.option("--verbose", is_flag=True, help="Print verbose output")
@click.option("--override", is_flag=True, help="Override existing files")
@click.option(
    "--partial-file-name", help="Only parse/convert files that match this string"
)
def debates(
    start_date: str,
    end_date: str,
    download: bool = False,
    parse: bool = False,
    convert: bool = False,
    verbose: bool = False,
    override: bool = False,
    partial_file_name: str | None = None,
):
    """
    Download transcripts from Scottish Parliament between a start and end date.

    At least one of --download, --parse or --convert must be given; the
    selected stages are run in that order for each file in the date range.
    Raises ValueError (via date.fromisoformat) if either date is not a
    valid ISO date.
    """

    start = datetime.date.fromisoformat(start_date)
    end = datetime.date.fromisoformat(end_date)

    # if none of the flags are set, error that at least one flag must be set
    if not any([download, parse, convert]):
        click.echo("At least one of the flags must be set")
        return

    # iterate through downloaded files if we're downloading them
    # otherwise go find the relevant files based on name
    if download:
        file_iterator = fetch_debates_for_dates(
            start.isoformat(),
            end.isoformat(),
            verbose=verbose,
            cache_dir=cache_dir,
            override=override,
        )
    else:
        file_iterator = cache_dir_iterator(cache_dir, start, end)

    for file in file_iterator:
        # optional narrowing to files whose name starts with the given string
        if partial_file_name:
            if not file.name.startswith(partial_file_name):
                continue
        if parse:
            if verbose:
                print(f"Parsing up {file}")
            tidy_up_html(file)
        if convert:
            if verbose:
                print(f"Converting {file} to TheyWorkForYou format")
            convert_to_twfy(file, output_dir, verbose=verbose)


if __name__ == "__main__":
Expand Down
19 changes: 1 addition & 18 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,21 +259,4 @@ def tidy_up_html(xml_path: Path):
etree.indent(root, space=" ")

with xml_path.open("wb") as f:
f.write(etree.tostring(root, pretty_print=True))


def parse_downloaded(
    cache_dir: Path, partial_file_name: str | None = None, verbose: bool = False
):
    """
    Tidy up the raw_html elements in the cache directory's xml files,
    refreshing the 'parsed' element under each agenda-item.

    When partial_file_name is given, only files whose names start with
    that string are processed; otherwise every *.xml file is reparsed.
    """
    glob_pattern = f"{partial_file_name}*" if partial_file_name else "*.xml"
    for xml_file in cache_dir.glob(glob_pattern):
        if verbose:
            print(f"Tidying up {xml_file}")
        tidy_up_html(xml_file)
f.write(etree.tostring(root, pretty_print=True))
2 changes: 1 addition & 1 deletion scripts/updatedaterange-parse
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pyscraper/wa/parse.py ~/parldata/cmpages/senedd ~/parldata/scrapedxml/senedd

# Scottish Parliament:
cd ~/parlparse/
python -m pyscraper.sp_2024 parse-debates-on-date-range --start-date $FROMDATE --end-date $TODATE
python -m pyscraper.sp_2024 debates --parse --convert --start-date $FROMDATE --end-date $TODATE

# London Assembly questions
#cd ~/parlparse/london-mayors-questions
Expand Down
2 changes: 1 addition & 1 deletion scripts/updatedaterange-scrape
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ wa/scrape.py

# Scottish Parliament
cd ~/parlparse
python -m pyscraper.sp_2024 fetch-debates-on-date-range --start-date $FROMDATE --end-date $TODATE
python -m pyscraper.sp_2024 debates --download --start-date $FROMDATE --end-date $TODATE


# Return error code
Expand Down

0 comments on commit 835292d

Please sign in to comment.