Skip to content

Commit

Permalink
fixup! Update scripts to reference new SP scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Apr 22, 2024
1 parent f135873 commit 835292d
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 91 deletions.
132 changes: 61 additions & 71 deletions pyscraper/sp_2024/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from __future__ import annotations

from .download import fetch_debates_for_dates
from .parse import parse_downloaded, tidy_up_html
from .parse import tidy_up_html
from .convert import convert_to_twfy
import click
from pathlib import Path
Expand All @@ -25,96 +25,86 @@ def cli():
pass


def cache_dir_iterator(
    cache_dir: Path,
    start_date: datetime.date,
    end_date: datetime.date,
):
    """
    Yield xml files in cache_dir whose dates fall between start_date and
    end_date (both inclusive).

    Filenames are expected to begin with an ISO date (YYYY-MM-DD); files
    whose names do not start with a valid ISO date are skipped rather than
    aborting the whole run.
    """

    for file in cache_dir.glob("*.xml"):
        # date is an iso date at the start of the filename
        try:
            date = datetime.date.fromisoformat(file.stem[:10])
        except ValueError:
            # not a scraper-produced file (no leading ISO date) - ignore it
            continue
        if start_date <= date <= end_date:
            yield file


@cli.command()
@click.option(
    "--start-date", help="isodate to start fetching debates from", required=True
)
@click.option("--end-date", help="isodate to end fetching debates at", required=True)
@click.option(
    "--download",
    is_flag=True,
    help="Download the debates, pair with 'override' to redownload all files",
)
@click.option("--parse", is_flag=True, help="Parse the downloaded debates")
@click.option("--convert", is_flag=True, help="Convert the parsed debates")
@click.option("--verbose", is_flag=True, help="Print verbose output")
@click.option("--override", is_flag=True, help="Override existing files")
@click.option(
    "--partial-file-name", help="Only parse/convert files that match this string"
)
def debates(
    start_date: str,
    end_date: str,
    download: bool = False,
    parse: bool = False,
    convert: bool = False,
    verbose: bool = False,
    override: bool = False,
    partial_file_name: str | None = None,
):
    """
    Download transcripts from Scottish Parliament between a start and end date.

    At least one of --download, --parse or --convert must be given; the
    selected stages are run in that order for each file in the date range.
    Raises ValueError (via date.fromisoformat) if either date is not a
    valid ISO date.
    """

    start = datetime.date.fromisoformat(start_date)
    end = datetime.date.fromisoformat(end_date)

    # if none of the flags are set, error that at least one flag must be set
    if not any([download, parse, convert]):
        click.echo("At least one of the flags must be set")
        return

    # iterate through downloaded files if we're downloading them
    # otherwise go find the relevant files based on name
    if download:
        file_iterator = fetch_debates_for_dates(
            start.isoformat(),
            end.isoformat(),
            verbose=verbose,
            cache_dir=cache_dir,
            override=override,
        )
    else:
        file_iterator = cache_dir_iterator(cache_dir, start, end)

    for file in file_iterator:
        # optional narrowing to files whose name starts with the given string
        if partial_file_name:
            if not file.name.startswith(partial_file_name):
                continue
        if parse:
            if verbose:
                print(f"Parsing up {file}")
            tidy_up_html(file)
        if convert:
            if verbose:
                print(f"Converting {file} to TheyWorkForYou format")
            convert_to_twfy(file, output_dir, verbose=verbose)


if __name__ == "__main__":
Expand Down
19 changes: 1 addition & 18 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,21 +259,4 @@ def tidy_up_html(xml_path: Path):
etree.indent(root, space=" ")

with xml_path.open("wb") as f:
f.write(etree.tostring(root, pretty_print=True))


def parse_downloaded(
    cache_dir: Path, partial_file_name: str | None = None, verbose: bool = False
):
    """
    Tidy up the raw_html elements in the cache directory's xml files,
    refreshing the 'parsed' element under each agenda-item.

    When partial_file_name is given, only files whose names start with
    that string are processed; otherwise every *.xml file is reparsed.
    """
    glob_pattern = f"{partial_file_name}*" if partial_file_name else "*.xml"
    for xml_file in cache_dir.glob(glob_pattern):
        if verbose:
            print(f"Tidying up {xml_file}")
        tidy_up_html(xml_file)
f.write(etree.tostring(root, pretty_print=True))
2 changes: 1 addition & 1 deletion scripts/updatedaterange-parse
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pyscraper/wa/parse.py ~/parldata/cmpages/senedd ~/parldata/scrapedxml/senedd

# Scottish Parliament:
cd ~/parlparse/
python -m pyscraper.sp_2024 parse-debates-on-date-range --start-date $FROMDATE --end-date $TODATE
python -m pyscraper.sp_2024 debates --parse --convert --start-date $FROMDATE --end-date $TODATE

# London Assembly questions
#cd ~/parlparse/london-mayors-questions
Expand Down
2 changes: 1 addition & 1 deletion scripts/updatedaterange-scrape
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ wa/scrape.py

# Scottish Parliament
cd ~/parlparse
python -m pyscraper.sp_2024 fetch-debates-on-date-range --start-date $FROMDATE --end-date $TODATE
python -m pyscraper.sp_2024 debates --download --start-date $FROMDATE --end-date $TODATE


# Return error code
Expand Down

0 comments on commit 835292d

Please sign in to comment.