Skip to content

Commit

Permalink
Make the CCEW outputs for local authorities upload to S3
Browse files Browse the repository at this point in the history
  • Loading branch information
drkane committed Dec 4, 2023
1 parent 1ccc6eb commit 4f966f6
Show file tree
Hide file tree
Showing 7 changed files with 165 additions and 113 deletions.
162 changes: 74 additions & 88 deletions charity/management/commands/output_ccew.py
Original file line number Diff line number Diff line change
@@ -1,111 +1,97 @@
import xlsxwriter
import io

from boto3 import session
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import connections

from charity.views import export_to_sqlite, export_to_xlsx

from charity.models import (
CCEWCharity,
CCEWCharityAnnualReturnHistory,
CCEWCharityAreaOfOperation,
CCEWCharityARPartA,
CCEWCharityARPartB,
CCEWCharityClassification,
CCEWCharityEventHistory,
CCEWCharityGoverningDocument,
CCEWCharityOtherNames,
CCEWCharityOtherRegulators,
CCEWCharityPolicy,
CCEWCharityPublishedReport,
CCEWCharityTrustee,
)
AREAS_QUERY = """
SELECT DISTINCT "{geo_field}" AS codes
FROM ftc_organisationlocation fo
WHERE "{geo_field}" IS NOT NULL
"""


class Command(BaseCommand):
help = "Export excel data from CCEW for a given area"
ccew_tables = [
CCEWCharity,
CCEWCharityAnnualReturnHistory,
CCEWCharityAreaOfOperation,
CCEWCharityARPartA,
CCEWCharityARPartB,
CCEWCharityClassification,
CCEWCharityEventHistory,
CCEWCharityGoverningDocument,
CCEWCharityOtherNames,
CCEWCharityOtherRegulators,
CCEWCharityPolicy,
CCEWCharityPublishedReport,
CCEWCharityTrustee,
]

def add_arguments(self, parser):
parser.add_argument("area", type=str, help="Area to export data for")
parser.add_argument("filename", type=str, help="Filename to export to")
parser.add_argument("area", type=str, nargs="+", help="Area to export data for")
parser.add_argument(
"--filename",
type=str,
default="data/{geo_field}/{geocode}",
help="Filename to export to. Don't include extension",
)
parser.add_argument(
"--geo-field",
type=str,
default="geo_laua",
help="Geo field to use for filtering",
)
parser.add_argument(
"--filetype",
type=str,
default="xlsx",
help="Filetype for export (xlsx or sqlite)",
)
parser.add_argument(
"--upload-to-storage",
action="store_true",
help="Upload file to S3 or compatible storage after export",
)

def handle(self, *args, **options):
base_query = """
WITH a AS (
SELECT DISTINCT replace(org_id, 'GB-CHC-', '')::INTEGER AS charity_number
FROM ftc_organisationlocation fo
WHERE "org_id" LIKE 'GB-CHC-%%'
AND "{geo_field}" = %(area)s
)
SELECT c.*
FROM "{charity_table}" c
INNER JOIN a
ON c.registered_charity_number = a.charity_number
"""
self.cursor = connections["data"].cursor()
self.s3_client = None

workbook = xlsxwriter.Workbook(
options["filename"], {"default_date_format": "yyyy-mm-dd"}
)
for table in self.ccew_tables:
query = base_query.format(
charity_table=table._meta.db_table, geo_field=options["geo_field"]
)
self.stdout.write(f"Exporting {table._meta.db_table}")
worksheet = workbook.add_worksheet(
table._meta.db_table.replace("charity_ccew", "")
if options["upload_to_storage"]:
s3_session = session.Session()
self.s3_client = s3_session.client(
"s3",
region_name=settings.S3_REGION,
endpoint_url=settings.S3_ENDPOINT,
aws_access_key_id=settings.S3_ACCESS_ID,
aws_secret_access_key=settings.S3_SECRET_KEY,
)

columns = [
column.name
for column in table._meta.get_fields()
if column.name != "id"
]
table_data = []

for row_index, row in enumerate(
table.objects.raw(query, {"area": options["area"]})
):
record = {
k: v for k, v in row.__dict__.items() if k not in ("_state", "id")
}
table_data.append([record.get(column, "") for column in columns])
if "all" in options["area"]:
self.cursor.execute(AREAS_QUERY.format(geo_field=options["geo_field"]))
areas = [row[0] for row in self.cursor.fetchall()]
else:
areas = options["area"]

if len(table_data) == 0:
table_data = [[None for _ in columns]]
if len(areas) > 1:
if "{geocode}" not in options["filename"]:
raise ValueError(
"Filename must contain {geocode} when exporting multiple areas"
)

worksheet.add_table(
0,
0,
len(table_data),
len(columns) - 1,
{
"data": table_data,
"columns": [
{
"header": column,
}
for column in columns
],
"name": table._meta.db_table.replace("charity_ccew", ""),
},
for area in areas:
filename = options["filename"].format(
geocode=area, geo_field=options["geo_field"]
)
worksheet.autofit()
print(filename)
if options["filetype"] == "xlsx":
filename += ".xlsx"
data = export_to_xlsx(options["geo_field"], area)
elif options["filetype"] == "sqlite":
filename += ".sqlite"
data = export_to_sqlite(options["geo_field"], area)
else:
raise ValueError("Invalid filetype")

workbook.close()
if self.s3_client:
self.s3_client.upload_fileobj(
io.BytesIO(data),
settings.S3_BUCKET,
filename,
ExtraArgs={"ACL": "public-read"},
)
print(f"{filename} uploaded to s3")
else:
with open(filename, "wb") as f:
f.write(data)
print(f"{filename} saved")
10 changes: 10 additions & 0 deletions charity/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
views.export_data,
{"filetype": "xlsx"},
),
path(
"export/area/<str:geoarea>/<str:geocode>.sqlite",
views.export_data,
{"filetype": "sqlite"},
),
path(
"export/area/<str:geoarea>/<str:geocode>.db",
views.export_data,
{"filetype": "sqlite"},
),
path("<str:regno>.json", views.get_charity, {"filetype": "json"}),
path("<str:regno>.html", views.get_charity, {"filetype": "html"}),
path("<str:regno>", views.get_charity, {"filetype": "html"}, name="charity_html"),
Expand Down
79 changes: 56 additions & 23 deletions charity/views.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import io
import os
from tempfile import NamedTemporaryFile

import xlsxwriter
from django.http import HttpResponse
from django.shortcuts import redirect
from sqlite_utils import Database

from charity.models import (
CCEWCharity,
Expand Down Expand Up @@ -39,6 +42,12 @@
]


GEOAREAS = {
"la": "geo_laua",
"rgn": "geo_rgn",
}


def get_charity(request, regno, filetype="html", preview=False):
org_id = regno_to_orgid(regno)

Expand All @@ -48,52 +57,56 @@ def get_charity(request, regno, filetype="html", preview=False):
return get_org_by_id(request, org_id, filetype, preview, as_charity=True)


def export_data(request, geoarea, geocode, filetype="html"):
geoareas = {
"la": "geo_laua",
"rgn": "geo_rgn",
}
geoarea = geoareas.get(geoarea.lower(), "geo_laua")

def get_export_data(geofield, geocode):
base_query = """
WITH a AS (
SELECT DISTINCT replace(org_id, 'GB-CHC-', '')::INTEGER AS charity_number
FROM ftc_organisationlocation fo
WHERE "org_id" LIKE 'GB-CHC-%%'
AND "{geo_field}" = %(area)s
AND "{geo_field}" IN %(area)s
)
SELECT c.*
FROM "{charity_table}" c
INNER JOIN a
ON c.registered_charity_number = a.charity_number
"""

buffer = io.BytesIO()
workbook = xlsxwriter.Workbook(
buffer,
{"default_date_format": "yyyy-mm-dd", "in_memory": True},
)
for table in ccew_tables:
query = base_query.format(charity_table=table._meta.db_table, geo_field=geoarea)
print(f"Exporting {table._meta.db_table}")
worksheet = workbook.add_worksheet(
table._meta.db_table.replace("charity_ccew", "")
query = base_query.format(
charity_table=table._meta.db_table, geo_field=geofield
)
print(f"Exporting {table._meta.db_table}")

columns = [
column.name for column in table._meta.get_fields() if column.name != "id"
]
table_data = []

for row_index, row in enumerate(table.objects.raw(query, {"area": geocode})):
for row_index, row in enumerate(
table.objects.raw(query, {"area": tuple(geocode.split("+"))})
):
record = {
k: v for k, v in row.__dict__.items() if k not in ("_state", "id")
}
table_data.append([record.get(column, "") for column in columns])

yield table, columns, table_data


def export_to_xlsx(geoarea, geocode):
buffer = io.BytesIO()
workbook = xlsxwriter.Workbook(
buffer,
{"default_date_format": "yyyy-mm-dd", "in_memory": True},
)

for table, columns, table_data in get_export_data(geoarea, geocode):
if len(table_data) == 0:
table_data = [[None for _ in columns]]

worksheet = workbook.add_worksheet(
table._meta.db_table.replace("charity_ccew", "")
)
worksheet.add_table(
0,
0,
Expand All @@ -113,10 +126,30 @@ def export_data(request, geoarea, geocode, filetype="html"):
worksheet.autofit()

workbook.close()
return buffer.getvalue()


def export_to_sqlite(geoarea, geocode):
    """Export CCEW charity data for an area as a SQLite database.

    Args:
        geoarea: name of the geo field to filter on (e.g. ``"geo_laua"``).
        geocode: area code(s) to export — passed through to
            ``get_export_data`` (``"+"``-separated for multiple areas,
            per that function's handling).

    Returns:
        bytes: the raw contents of the generated SQLite database file.
    """
    # sqlite_utils.Database needs a real file on disk, so build the
    # database in a temporary file and read the bytes back afterwards.
    # delete=False because we reopen the file by name after closing it.
    tmpfile = NamedTemporaryFile(suffix=".db", delete=False)
    tmpfile.close()

    try:
        db = Database(tmpfile.name)
        for table, columns, table_data in get_export_data(geoarea, geocode):
            # Worksheet/table names drop the common "charity_ccew" prefix,
            # matching the naming used by the xlsx export.
            db_table = db[table._meta.db_table.replace("charity_ccew", "")]
            db_table.insert_all([dict(zip(columns, row)) for row in table_data])
        db.close()

        with open(tmpfile.name, "rb") as output_file:
            data = output_file.read()
    finally:
        # Remove the temp file even if the export fails part-way,
        # so failed exports don't leak files in the temp directory.
        os.unlink(tmpfile.name)

    return data


response = HttpResponse(
buffer.getvalue(),
content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
def export_data(request, geoarea, geocode, filetype="xlsx"):
s3_file_url = "https://findthatcharity.ams3.cdn.digitaloceanspaces.com/findthatcharity/data/{geoarea}/{geocode}.{filetype}".format(
geoarea=geoarea, geocode=geocode, filetype=filetype
)
response["Content-Disposition"] = f"attachment; filename=ccew_export_{geocode}.xlsx"
return response
return redirect(s3_file_url, permanent=True)
6 changes: 6 additions & 0 deletions findthatcharity/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,3 +322,9 @@
EMAIL_PORT = os.environ.get("EMAIL_PORT")
EMAIL_USE_SSL = os.environ.get("EMAIL_USE_SSL") == "True"
EMAIL_USE_SSL = os.environ.get("EMAIL_USE_SSL") == "True"

S3_REGION = os.environ.get("S3_REGION")
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")
S3_ACCESS_ID = os.environ.get("S3_ACCESS_ID")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_BUCKET = os.environ.get("S3_BUCKET")
3 changes: 2 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ ua-parser
sqlite-utils
git+https://github.com/kanedata/[email protected]
certifi
xlsxwriter
xlsxwriter
boto3
Loading

0 comments on commit 4f966f6

Please sign in to comment.