Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
drkane committed Apr 30, 2024
1 parent be3f244 commit fc76b69
Show file tree
Hide file tree
Showing 31 changed files with 1,018 additions and 418 deletions.
11 changes: 10 additions & 1 deletion findthatpostcode/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from findthatpostcode.db import init_db

from . import boundaries, codes, placenames, postcodes, stats
from . import boundaries, codes, placenames, postcodes, stats, utils


@click.command("init-db")
Expand Down Expand Up @@ -32,4 +32,13 @@ def import_group():
import_group.add_command(stats.import_imd2015)
import_group.add_command(stats.import_imd2019)


# Command group exposed as `utils` on the top-level CLI. It has no behaviour
# of its own; it only acts as a namespace for the sub-commands registered on
# it after this definition.
@cli.group(name="utils")
def utils_group():
    pass


utils_group.add_command(utils.sample_zip)


cli.add_command(init_db_command)
81 changes: 75 additions & 6 deletions findthatpostcode/commands/postcodes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Import commands for postcode lookup files (NSPL, ONSPD, NHSPD and the parliamentary constituency lookup)
"""

import csv
import io
import zipfile
Expand All @@ -11,7 +12,7 @@
from tqdm import tqdm

from findthatpostcode import db, settings
from findthatpostcode.documents import Postcode
from findthatpostcode.documents import Postcode, PostcodeSource
from findthatpostcode.utils import BulkImporter

PC_INDEX = Postcode.Index.name
Expand All @@ -22,6 +23,64 @@
@click.option("--url", default=settings.NSPL_URL)
@click.option("--file", default=None)
def import_nspl(url=settings.NSPL_URL, es_index=PC_INDEX, file=None):
return import_from_postcode_file(
url=url,
es_index=es_index,
file=file,
filetype=PostcodeSource.NSPL,
file_location="Data/multi_csv/NSPL",
)


@click.command("onspd")
@click.option("--es-index", default=PC_INDEX)
@click.option("--url", default=settings.ONSPD_URL)
@click.option("--file", default=None)
def import_onspd(url=settings.ONSPD_URL, es_index=PC_INDEX, file=None):
    # CLI entry point for the ONSPD source. All of the work is delegated to
    # import_from_postcode_file; this wrapper only fixes the source type and
    # the path prefix passed as `file_location`.
    source = PostcodeSource.ONSPD
    csv_prefix = "Data/multi_csv/ONSPD"
    result = import_from_postcode_file(
        url=url,
        es_index=es_index,
        file=file,
        filetype=source,
        file_location=csv_prefix,
    )
    return result


@click.command("nhspd")
@click.option("--es-index", default=PC_INDEX)
@click.option("--url", default=settings.NHSPD_URL)
@click.option("--file", default=None)
def import_nhspd(url=settings.NHSPD_URL, es_index=PC_INDEX, file=None):
    # CLI entry point for the NHSPD source. All of the work is delegated to
    # import_from_postcode_file; this wrapper only fixes the source type and
    # the path prefix passed as `file_location`.
    source = PostcodeSource.NHSPD
    csv_prefix = "Data/"
    result = import_from_postcode_file(
        url=url,
        es_index=es_index,
        file=file,
        filetype=source,
        file_location=csv_prefix,
    )
    return result


@click.command("pcon")
@click.option("--es-index", default=PC_INDEX)
@click.option("--url", default=settings.PCON_URL)
@click.option("--file", default=None)
def import_pcon(url=settings.PCON_URL, es_index=PC_INDEX, file=None):
    # CLI entry point for the parliamentary constituency lookup. All of the
    # work is delegated to import_from_postcode_file; this wrapper only fixes
    # the source type and the path prefix passed as `file_location`.
    source = PostcodeSource.PCON
    csv_prefix = "pcd_pcon_"
    result = import_from_postcode_file(
        url=url,
        es_index=es_index,
        file=file,
        filetype=source,
        file_location=csv_prefix,
    )
    return result


def import_from_postcode_file(
url=settings.NSPL_URL,
es_index=PC_INDEX,
file=None,
filetype: PostcodeSource = PostcodeSource.NSPL,
file_location: str = "Data/multi_csv/NSPL",
):
if settings.DEBUG:
requests_cache.install_cache()

Expand All @@ -35,25 +94,35 @@ def import_nspl(url=settings.NSPL_URL, es_index=PC_INDEX, file=None):
r = requests.get(url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))

fieldnames = None
if filetype == PostcodeSource.NHSPD:
fieldnames = settings.NHSPD_FIELDNAMES

for f in z.filelist:
if not f.filename.endswith(".csv") or not f.filename.startswith(
"Data/multi_csv/NSPL"
):
if not f.filename.endswith(".csv") or not f.filename.startswith(file_location):
continue

print(f"[postcodes] Opening {f.filename}")

with z.open(f, "r") as pccsv, BulkImporter(es, name="postcodes") as importer:
pccsv = io.TextIOWrapper(pccsv)
reader = csv.DictReader(pccsv)
reader = csv.DictReader(pccsv, fieldnames=fieldnames)
for record in tqdm(reader):
if filetype == PostcodeSource.PCON:
record = {
"pcds": record["pcd"],
"pcon25": record["pconcd"],
}

importer.add(
{
"_index": es_index,
"_op_type": "update",
"_id": record["pcds"],
"doc_as_upsert": True,
"doc": Postcode.from_csv(record).to_dict(),
"doc": {
filetype.value: Postcode.from_csv(record).to_dict(),
},
}
)

Expand Down
27 changes: 27 additions & 0 deletions findthatpostcode/commands/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import random
import zipfile

import click


@click.command("sample-zip")
@click.argument("input", type=click.Path(exists=True))
@click.argument("output", type=click.Path())
def sample_zip(input, output):
    """Copy the INPUT zip to OUTPUT, sampling down any large CSV files."""
    # CSV members with at most this many lines (first line included) are
    # copied unchanged.
    max_unsampled_lines = 1000
    # Number of randomly chosen lines kept, in addition to the first line,
    # for CSV members above the threshold.
    sample_size = 100

    # Both archives are opened as context managers so they are always closed.
    # Closing the output archive is what writes its central directory, so
    # relying on garbage collection could leave an unreadable file behind.
    with zipfile.ZipFile(input) as input_zip, zipfile.ZipFile(
        output, "w", compression=zipfile.ZIP_DEFLATED
    ) as output_zip:
        for member in input_zip.filelist:
            # if the file is not a CSV then just copy it across
            if not member.filename.endswith(".csv"):
                output_zip.writestr(member, input_zip.read(member))
                continue

            # if it's a CSV then read it in and write out a sample
            with input_zip.open(member, "r") as pccsv:
                lines = pccsv.readlines()

            if len(lines) <= max_unsampled_lines:
                output_lines = lines
            else:
                # Always keep the first line (presumably the header row),
                # followed by a random selection of the remaining lines.
                output_lines = [lines[0]] + random.sample(lines[1:], sample_size)
            output_zip.writestr(member, b"".join(output_lines))
12 changes: 4 additions & 8 deletions findthatpostcode/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ def record_to_schema(
name_fields: Optional[List[str]] = None,
name_lookup: Optional[Dict[str, Optional[str]]] = None,
**kwargs,
) -> schemas.Area:
...
) -> schemas.Area: ...


@overload
Expand All @@ -70,8 +69,7 @@ def record_to_schema(
name_fields: Optional[List[str]] = None,
name_lookup: Optional[Dict[str, Optional[str]]] = None,
**kwargs,
) -> schemas.Placename:
...
) -> schemas.Placename: ...


@overload
Expand All @@ -81,8 +79,7 @@ def record_to_schema(
name_fields: Optional[List[str]] = None,
name_lookup: Optional[Dict[str, Optional[str]]] = None,
**kwargs,
) -> schemas.NearestPoint:
...
) -> schemas.NearestPoint: ...


@overload
Expand All @@ -92,8 +89,7 @@ def record_to_schema(
name_fields: Optional[List[str]] = None,
name_lookup: Optional[Dict[str, Optional[str]]] = None,
**kwargs,
) -> schemas.Postcode:
...
) -> schemas.Postcode: ...


def record_to_schema(
Expand Down
4 changes: 2 additions & 2 deletions findthatpostcode/documents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .area import Area
from .entity import Entity
from .placename import Placename
from .postcode import Postcode
from .postcode import Postcode, PostcodeSource

__all__ = ["Postcode", "Entity", "Area", "Placename"]
__all__ = ["Postcode", "Entity", "Area", "Placename", "PostcodeSource"]
38 changes: 31 additions & 7 deletions findthatpostcode/documents/postcode.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import hashlib
from enum import Enum
from typing import Any, Dict, List

from elasticsearch_dsl import Document, field
Expand All @@ -8,6 +9,13 @@
from findthatpostcode.utils import PostcodeStr


class PostcodeSource(Enum):
    """Identifies which source file a postcode record was imported from.

    The import commands pass one of these members as ``filetype``; the
    member's string value is then used as the key under which that source's
    fields are stored in the postcode document.
    """

    NSPL = "nspl"
    ONSPD = "onspd"
    NHSPD = "nhspd"
    PCON = "pcon"  # new parliamentary constituencies - separate lookup provided


class Postcode(Document):
pcd = field.Keyword()
pcd2 = field.Keyword()
Expand Down Expand Up @@ -68,10 +76,14 @@ def area_codes(self) -> List[str]:
return [f for f in self.to_dict().values() if isinstance(f, str)]

@classmethod
def from_csv(cls, original_record: Dict[str, str]) -> "Postcode":
def from_csv(
cls,
original_record: Dict[str, str],
) -> "Postcode":
"""Create a Postcode object from a NSPL record"""
record: Dict[str, Any] = original_record.copy()
postcode = PostcodeStr(record["pcds"])
record["pcds"] = str(postcode)

# null any blank fields (or ones with a dummy code in)
for k in record:
Expand All @@ -80,24 +92,36 @@ def from_csv(cls, original_record: Dict[str, str]) -> "Postcode":

# date fields
for date_field in ["dointr", "doterm"]:
if record[date_field]:
if record.get(date_field):
record[date_field] = datetime.datetime.strptime(
record[date_field], "%Y%m"
)

# latitude and longitude
for geo_field in ["lat", "long"]:
if record[geo_field]:
if record.get(geo_field):
record[geo_field] = float(record[geo_field])
if record[geo_field] == 99.999999:
record[geo_field] = None
if record["lat"] and record["long"]:
if record.get("lat") and record.get("long"):
record["location"] = {"lat": record["lat"], "lon": record["long"]}

# integer fields
for int_field in ["oseast1m", "osnrth1m", "usertype", "osgrdind", "imd"]:
if record[int_field]:
record[int_field] = int(record[int_field])
for int_field in [
"oseast1m",
"osnrth1m",
"oseast100m",
"osnrth100m",
"usertype",
"osgrdind",
"imd",
]:
if record.get(int_field):
value = record[int_field].strip()
if value == "":
record[int_field] = None
else:
record[int_field] = int(value)

# add postcode hash
record["hash"] = hashlib.md5(
Expand Down
Loading

0 comments on commit fc76b69

Please sign in to comment.