updates

kanedata · Apr 30, 2024 · fc76b69 · fc76b69
1 parent be3f244
commit fc76b69
Show file tree

Hide file tree

Showing 31 changed files with 1,018 additions and 418 deletions.
diff --git a/findthatpostcode/commands/__init__.py b/findthatpostcode/commands/__init__.py
@@ -2,7 +2,7 @@
 
 from findthatpostcode.db import init_db
 
-from . import boundaries, codes, placenames, postcodes, stats
+from . import boundaries, codes, placenames, postcodes, stats, utils
 
 
 @click.command("init-db")
@@ -32,4 +32,13 @@ def import_group():
 import_group.add_command(stats.import_imd2015)
 import_group.add_command(stats.import_imd2019)
 
+
+@cli.group(name="utils")
+def utils_group():
+    pass
+
+
+utils_group.add_command(utils.sample_zip)
+
+
 cli.add_command(init_db_command)
diff --git a/findthatpostcode/commands/postcodes.py b/findthatpostcode/commands/postcodes.py
@@ -1,6 +1,7 @@
 """
 Import commands for the register of geographic codes and code history database
 """
+
 import csv
 import io
 import zipfile
@@ -11,7 +12,7 @@
 from tqdm import tqdm
 
 from findthatpostcode import db, settings
-from findthatpostcode.documents import Postcode
+from findthatpostcode.documents import Postcode, PostcodeSource
 from findthatpostcode.utils import BulkImporter
 
 PC_INDEX = Postcode.Index.name
@@ -22,6 +23,64 @@
 @click.option("--url", default=settings.NSPL_URL)
 @click.option("--file", default=None)
 def import_nspl(url=settings.NSPL_URL, es_index=PC_INDEX, file=None):
+    return import_from_postcode_file(
+        url=url,
+        es_index=es_index,
+        file=file,
+        filetype=PostcodeSource.NSPL,
+        file_location="Data/multi_csv/NSPL",
+    )
+
+
+@click.command("onspd")
+@click.option("--es-index", default=PC_INDEX)
+@click.option("--url", default=settings.ONSPD_URL)
+@click.option("--file", default=None)
+def import_onspd(url=settings.ONSPD_URL, es_index=PC_INDEX, file=None):
+    return import_from_postcode_file(
+        url=url,
+        es_index=es_index,
+        file=file,
+        filetype=PostcodeSource.ONSPD,
+        file_location="Data/multi_csv/ONSPD",
+    )
+
+
+@click.command("nhspd")
+@click.option("--es-index", default=PC_INDEX)
+@click.option("--url", default=settings.NHSPD_URL)
+@click.option("--file", default=None)
+def import_nhspd(url=settings.NHSPD_URL, es_index=PC_INDEX, file=None):
+    return import_from_postcode_file(
+        url=url,
+        es_index=es_index,
+        file=file,
+        filetype=PostcodeSource.NHSPD,
+        file_location="Data/",
+    )
+
+
+@click.command("pcon")
+@click.option("--es-index", default=PC_INDEX)
+@click.option("--url", default=settings.PCON_URL)
+@click.option("--file", default=None)
+def import_pcon(url=settings.PCON_URL, es_index=PC_INDEX, file=None):
+    return import_from_postcode_file(
+        url=url,
+        es_index=es_index,
+        file=file,
+        filetype=PostcodeSource.PCON,
+        file_location="pcd_pcon_",
+    )
+
+
+def import_from_postcode_file(
+    url=settings.NSPL_URL,
+    es_index=PC_INDEX,
+    file=None,
+    filetype: PostcodeSource = PostcodeSource.NSPL,
+    file_location: str = "Data/multi_csv/NSPL",
+):
     if settings.DEBUG:
         requests_cache.install_cache()
 
@@ -35,25 +94,35 @@ def import_nspl(url=settings.NSPL_URL, es_index=PC_INDEX, file=None):
         r = requests.get(url, stream=True)
         z = zipfile.ZipFile(io.BytesIO(r.content))
 
+    fieldnames = None
+    if filetype == PostcodeSource.NHSPD:
+        fieldnames = settings.NHSPD_FIELDNAMES
+
     for f in z.filelist:
-        if not f.filename.endswith(".csv") or not f.filename.startswith(
-            "Data/multi_csv/NSPL"
-        ):
+        if not f.filename.endswith(".csv") or not f.filename.startswith(file_location):
             continue
 
         print(f"[postcodes] Opening {f.filename}")
 
         with z.open(f, "r") as pccsv, BulkImporter(es, name="postcodes") as importer:
             pccsv = io.TextIOWrapper(pccsv)
-            reader = csv.DictReader(pccsv)
+            reader = csv.DictReader(pccsv, fieldnames=fieldnames)
             for record in tqdm(reader):
+                if filetype == PostcodeSource.PCON:
+                    record = {
+                        "pcds": record["pcd"],
+                        "pcon25": record["pconcd"],
+                    }
+
                 importer.add(
                     {
                         "_index": es_index,
                         "_op_type": "update",
                         "_id": record["pcds"],
                         "doc_as_upsert": True,
-                        "doc": Postcode.from_csv(record).to_dict(),
+                        "doc": {
+                            filetype.value: Postcode.from_csv(record).to_dict(),
+                        },
                     }
                 )
 

diff --git a/findthatpostcode/commands/utils.py b/findthatpostcode/commands/utils.py
@@ -0,0 +1,27 @@
+import random
+import zipfile
+
+import click
+
+
+@click.command("sample-zip")
+@click.argument("input", type=click.Path(exists=True))
+@click.argument("output", type=click.Path())
+def sample_zip(input, output):
+    input_zip = zipfile.ZipFile(input)
+    output_zip = zipfile.ZipFile(output, "w", compression=zipfile.ZIP_DEFLATED)
+
+    for f in input_zip.filelist:
+        # if the file is not a CSV then just copy it across
+        if not f.filename.endswith(".csv"):
+            output_zip.writestr(f, input_zip.read(f))
+            continue
+
+        # if it's a CSV then read it in and write out a sample
+        with input_zip.open(f, "r") as pccsv:
+            lines = pccsv.readlines()
+            if len(lines) <= 1000:
+                output_lines = lines
+            else:
+                output_lines = [lines[0]] + random.sample(lines[1:], 100)
+            output_zip.writestr(f, b"".join(output_lines))
diff --git a/findthatpostcode/crud.py b/findthatpostcode/crud.py
@@ -59,8 +59,7 @@ def record_to_schema(
     name_fields: Optional[List[str]] = None,
     name_lookup: Optional[Dict[str, Optional[str]]] = None,
     **kwargs,
-) -> schemas.Area:
-    ...
+) -> schemas.Area: ...
 
 
 @overload
@@ -70,8 +69,7 @@ def record_to_schema(
     name_fields: Optional[List[str]] = None,
     name_lookup: Optional[Dict[str, Optional[str]]] = None,
     **kwargs,
-) -> schemas.Placename:
-    ...
+) -> schemas.Placename: ...
 
 
 @overload
@@ -81,8 +79,7 @@ def record_to_schema(
     name_fields: Optional[List[str]] = None,
     name_lookup: Optional[Dict[str, Optional[str]]] = None,
     **kwargs,
-) -> schemas.NearestPoint:
-    ...
+) -> schemas.NearestPoint: ...
 
 
 @overload
@@ -92,8 +89,7 @@ def record_to_schema(
     name_fields: Optional[List[str]] = None,
     name_lookup: Optional[Dict[str, Optional[str]]] = None,
     **kwargs,
-) -> schemas.Postcode:
-    ...
+) -> schemas.Postcode: ...
 
 
 def record_to_schema(

diff --git a/findthatpostcode/documents/__init__.py b/findthatpostcode/documents/__init__.py
@@ -1,6 +1,6 @@
 from .area import Area
 from .entity import Entity
 from .placename import Placename
-from .postcode import Postcode
+from .postcode import Postcode, PostcodeSource
 
-__all__ = ["Postcode", "Entity", "Area", "Placename"]
+__all__ = ["Postcode", "Entity", "Area", "Placename", "PostcodeSource"]
diff --git a/findthatpostcode/documents/postcode.py b/findthatpostcode/documents/postcode.py
@@ -1,5 +1,6 @@
 import datetime
 import hashlib
+from enum import Enum
 from typing import Any, Dict, List
 
 from elasticsearch_dsl import Document, field
@@ -8,6 +9,13 @@
 from findthatpostcode.utils import PostcodeStr
 
 
+class PostcodeSource(Enum):
+    NSPL = "nspl"
+    ONSPD = "onspd"
+    NHSPD = "nhspd"
+    PCON = "pcon"  # new parliamentary constituencies - separate lookup provided
+
+
 class Postcode(Document):
     pcd = field.Keyword()
     pcd2 = field.Keyword()
@@ -68,10 +76,14 @@ def area_codes(self) -> List[str]:
         return [f for f in self.to_dict().values() if isinstance(f, str)]
 
     @classmethod
-    def from_csv(cls, original_record: Dict[str, str]) -> "Postcode":
+    def from_csv(
+        cls,
+        original_record: Dict[str, str],
+    ) -> "Postcode":
         """Create a Postcode object from a NSPL record"""
         record: Dict[str, Any] = original_record.copy()
         postcode = PostcodeStr(record["pcds"])
+        record["pcds"] = str(postcode)
 
         # null any blank fields (or ones with a dummy code in)
         for k in record:
@@ -80,24 +92,36 @@ def from_csv(cls, original_record: Dict[str, str]) -> "Postcode":
 
         # date fields
         for date_field in ["dointr", "doterm"]:
-            if record[date_field]:
+            if record.get(date_field):
                 record[date_field] = datetime.datetime.strptime(
                     record[date_field], "%Y%m"
                 )
 
         # latitude and longitude
         for geo_field in ["lat", "long"]:
-            if record[geo_field]:
+            if record.get(geo_field):
                 record[geo_field] = float(record[geo_field])
                 if record[geo_field] == 99.999999:
                     record[geo_field] = None
-        if record["lat"] and record["long"]:
+        if record.get("lat") and record.get("long"):
             record["location"] = {"lat": record["lat"], "lon": record["long"]}
 
         # integer fields
-        for int_field in ["oseast1m", "osnrth1m", "usertype", "osgrdind", "imd"]:
-            if record[int_field]:
-                record[int_field] = int(record[int_field])
+        for int_field in [
+            "oseast1m",
+            "osnrth1m",
+            "oseast100m",
+            "osnrth100m",
+            "usertype",
+            "osgrdind",
+            "imd",
+        ]:
+            if record.get(int_field):
+                value = record[int_field].strip()
+                if value == "":
+                    record[int_field] = None
+                else:
+                    record[int_field] = int(value)
 
         # add postcode hash
         record["hash"] = hashlib.md5(