Skip to content

Commit

Permalink
Make the CCEW outputs for local authorities upload to S3
Browse files Browse the repository at this point in the history
  • Loading branch information
drkane committed Dec 4, 2023
1 parent 1ccc6eb commit 4f966f6
Show file tree
Hide file tree
Showing 7 changed files with 165 additions and 113 deletions.
162 changes: 74 additions & 88 deletions charity/management/commands/output_ccew.py
Original file line number Diff line number Diff line change
@@ -1,111 +1,97 @@
import xlsxwriter
import io

from boto3 import session
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import connections

from charity.views import export_to_sqlite, export_to_xlsx

from charity.models import (
CCEWCharity,
CCEWCharityAnnualReturnHistory,
CCEWCharityAreaOfOperation,
CCEWCharityARPartA,
CCEWCharityARPartB,
CCEWCharityClassification,
CCEWCharityEventHistory,
CCEWCharityGoverningDocument,
CCEWCharityOtherNames,
CCEWCharityOtherRegulators,
CCEWCharityPolicy,
CCEWCharityPublishedReport,
CCEWCharityTrustee,
)
AREAS_QUERY = """
SELECT DISTINCT "{geo_field}" AS codes
FROM ftc_organisationlocation fo
WHERE "{geo_field}" IS NOT NULL
"""


class Command(BaseCommand):
help = "Export excel data from CCEW for a given area"
ccew_tables = [
CCEWCharity,
CCEWCharityAnnualReturnHistory,
CCEWCharityAreaOfOperation,
CCEWCharityARPartA,
CCEWCharityARPartB,
CCEWCharityClassification,
CCEWCharityEventHistory,
CCEWCharityGoverningDocument,
CCEWCharityOtherNames,
CCEWCharityOtherRegulators,
CCEWCharityPolicy,
CCEWCharityPublishedReport,
CCEWCharityTrustee,
]

def add_arguments(self, parser):
parser.add_argument("area", type=str, help="Area to export data for")
parser.add_argument("filename", type=str, help="Filename to export to")
parser.add_argument("area", type=str, nargs="+", help="Area to export data for")
parser.add_argument(
"--filename",
type=str,
default="data/{geo_field}/{geocode}",
help="Filename to export to. Don't include extension",
)
parser.add_argument(
"--geo-field",
type=str,
default="geo_laua",
help="Geo field to use for filtering",
)
parser.add_argument(
"--filetype",
type=str,
default="xlsx",
help="Filetype for export (xlsx or sqlite)",
)
parser.add_argument(
"--upload-to-storage",
action="store_true",
help="Upload file to S3 or compatible storage after export",
)

def handle(self, *args, **options):
base_query = """
WITH a AS (
SELECT DISTINCT replace(org_id, 'GB-CHC-', '')::INTEGER AS charity_number
FROM ftc_organisationlocation fo
WHERE "org_id" LIKE 'GB-CHC-%%'
AND "{geo_field}" = %(area)s
)
SELECT c.*
FROM "{charity_table}" c
INNER JOIN a
ON c.registered_charity_number = a.charity_number
"""
self.cursor = connections["data"].cursor()
self.s3_client = None

workbook = xlsxwriter.Workbook(
options["filename"], {"default_date_format": "yyyy-mm-dd"}
)
for table in self.ccew_tables:
query = base_query.format(
charity_table=table._meta.db_table, geo_field=options["geo_field"]
)
self.stdout.write(f"Exporting {table._meta.db_table}")
worksheet = workbook.add_worksheet(
table._meta.db_table.replace("charity_ccew", "")
if options["upload_to_storage"]:
s3_session = session.Session()
self.s3_client = s3_session.client(
"s3",
region_name=settings.S3_REGION,
endpoint_url=settings.S3_ENDPOINT,
aws_access_key_id=settings.S3_ACCESS_ID,
aws_secret_access_key=settings.S3_SECRET_KEY,
)

columns = [
column.name
for column in table._meta.get_fields()
if column.name != "id"
]
table_data = []

for row_index, row in enumerate(
table.objects.raw(query, {"area": options["area"]})
):
record = {
k: v for k, v in row.__dict__.items() if k not in ("_state", "id")
}
table_data.append([record.get(column, "") for column in columns])
if "all" in options["area"]:
self.cursor.execute(AREAS_QUERY.format(geo_field=options["geo_field"]))
areas = [row[0] for row in self.cursor.fetchall()]
else:
areas = options["area"]

if len(table_data) == 0:
table_data = [[None for _ in columns]]
if len(areas) > 1:
if "{geocode}" not in options["filename"]:
raise ValueError(
"Filename must contain {geocode} when exporting multiple areas"
)

worksheet.add_table(
0,
0,
len(table_data),
len(columns) - 1,
{
"data": table_data,
"columns": [
{
"header": column,
}
for column in columns
],
"name": table._meta.db_table.replace("charity_ccew", ""),
},
for area in areas:
filename = options["filename"].format(
geocode=area, geo_field=options["geo_field"]
)
worksheet.autofit()
print(filename)
if options["filetype"] == "xlsx":
filename += ".xlsx"
data = export_to_xlsx(options["geo_field"], area)
elif options["filetype"] == "sqlite":
filename += ".sqlite"
data = export_to_sqlite(options["geo_field"], area)
else:
raise ValueError("Invalid filetype")

workbook.close()
if self.s3_client:
self.s3_client.upload_fileobj(
io.BytesIO(data),
settings.S3_BUCKET,
filename,
ExtraArgs={"ACL": "public-read"},
)
print(f"{filename} uploaded to s3")
else:
with open(filename, "wb") as f:
f.write(data)
print(f"{filename} saved")
10 changes: 10 additions & 0 deletions charity/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
views.export_data,
{"filetype": "xlsx"},
),
path(
"export/area/<str:geoarea>/<str:geocode>.sqlite",
views.export_data,
{"filetype": "sqlite"},
),
path(
"export/area/<str:geoarea>/<str:geocode>.db",
views.export_data,
{"filetype": "sqlite"},
),
path("<str:regno>.json", views.get_charity, {"filetype": "json"}),
path("<str:regno>.html", views.get_charity, {"filetype": "html"}),
path("<str:regno>", views.get_charity, {"filetype": "html"}, name="charity_html"),
Expand Down
79 changes: 56 additions & 23 deletions charity/views.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import io
import os
from tempfile import NamedTemporaryFile

import xlsxwriter
from django.http import HttpResponse
from django.shortcuts import redirect
from sqlite_utils import Database

from charity.models import (
CCEWCharity,
Expand Down Expand Up @@ -39,6 +42,12 @@
]


GEOAREAS = {
"la": "geo_laua",
"rgn": "geo_rgn",
}


def get_charity(request, regno, filetype="html", preview=False):
org_id = regno_to_orgid(regno)

Expand All @@ -48,52 +57,56 @@ def get_charity(request, regno, filetype="html", preview=False):
return get_org_by_id(request, org_id, filetype, preview, as_charity=True)


def export_data(request, geoarea, geocode, filetype="html"):
geoareas = {
"la": "geo_laua",
"rgn": "geo_rgn",
}
geoarea = geoareas.get(geoarea.lower(), "geo_laua")

def get_export_data(geofield, geocode):
base_query = """
WITH a AS (
SELECT DISTINCT replace(org_id, 'GB-CHC-', '')::INTEGER AS charity_number
FROM ftc_organisationlocation fo
WHERE "org_id" LIKE 'GB-CHC-%%'
AND "{geo_field}" = %(area)s
AND "{geo_field}" IN %(area)s
)
SELECT c.*
FROM "{charity_table}" c
INNER JOIN a
ON c.registered_charity_number = a.charity_number
"""

buffer = io.BytesIO()
workbook = xlsxwriter.Workbook(
buffer,
{"default_date_format": "yyyy-mm-dd", "in_memory": True},
)
for table in ccew_tables:
query = base_query.format(charity_table=table._meta.db_table, geo_field=geoarea)
print(f"Exporting {table._meta.db_table}")
worksheet = workbook.add_worksheet(
table._meta.db_table.replace("charity_ccew", "")
query = base_query.format(
charity_table=table._meta.db_table, geo_field=geofield
)
print(f"Exporting {table._meta.db_table}")

columns = [
column.name for column in table._meta.get_fields() if column.name != "id"
]
table_data = []

for row_index, row in enumerate(table.objects.raw(query, {"area": geocode})):
for row_index, row in enumerate(
table.objects.raw(query, {"area": tuple(geocode.split("+"))})
):
record = {
k: v for k, v in row.__dict__.items() if k not in ("_state", "id")
}
table_data.append([record.get(column, "") for column in columns])

yield table, columns, table_data


def export_to_xlsx(geoarea, geocode):
buffer = io.BytesIO()
workbook = xlsxwriter.Workbook(
buffer,
{"default_date_format": "yyyy-mm-dd", "in_memory": True},
)

for table, columns, table_data in get_export_data(geoarea, geocode):
if len(table_data) == 0:
table_data = [[None for _ in columns]]

worksheet = workbook.add_worksheet(
table._meta.db_table.replace("charity_ccew", "")
)
worksheet.add_table(
0,
0,
Expand All @@ -113,10 +126,30 @@ def export_data(request, geoarea, geocode, filetype="html"):
worksheet.autofit()

workbook.close()
return buffer.getvalue()


def export_to_sqlite(geoarea, geocode):
    """Export CCEW charity data for an area as a SQLite database.

    Args:
        geoarea: name of the geo field to filter on (e.g. ``"geo_laua"``).
        geocode: area code(s) to export — passed through to
            ``get_export_data`` (``"+"``-separated for multiple areas,
            per that function's handling).

    Returns:
        bytes: the raw contents of the generated SQLite database file.
    """
    # sqlite_utils.Database needs a real file on disk, so build the
    # database in a temporary file and read the bytes back afterwards.
    # delete=False because we reopen the file by name after closing it.
    tmpfile = NamedTemporaryFile(suffix=".db", delete=False)
    tmpfile.close()

    try:
        db = Database(tmpfile.name)
        for table, columns, table_data in get_export_data(geoarea, geocode):
            # Worksheet/table names drop the common "charity_ccew" prefix,
            # matching the naming used by the xlsx export.
            db_table = db[table._meta.db_table.replace("charity_ccew", "")]
            db_table.insert_all([dict(zip(columns, row)) for row in table_data])
        db.close()

        with open(tmpfile.name, "rb") as output_file:
            data = output_file.read()
    finally:
        # Remove the temp file even if the export fails part-way,
        # so failed exports don't leak files in the temp directory.
        os.unlink(tmpfile.name)

    return data


response = HttpResponse(
buffer.getvalue(),
content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
def export_data(request, geoarea, geocode, filetype="xlsx"):
s3_file_url = "https://findthatcharity.ams3.cdn.digitaloceanspaces.com/findthatcharity/data/{geoarea}/{geocode}.{filetype}".format(
geoarea=geoarea, geocode=geocode, filetype=filetype
)
response["Content-Disposition"] = f"attachment; filename=ccew_export_{geocode}.xlsx"
return response
return redirect(s3_file_url, permanent=True)
6 changes: 6 additions & 0 deletions findthatcharity/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,3 +322,9 @@
EMAIL_PORT = os.environ.get("EMAIL_PORT")
EMAIL_USE_SSL = os.environ.get("EMAIL_USE_SSL") == "True"
EMAIL_USE_SSL = os.environ.get("EMAIL_USE_SSL") == "True"

S3_REGION = os.environ.get("S3_REGION")
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")
S3_ACCESS_ID = os.environ.get("S3_ACCESS_ID")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_BUCKET = os.environ.get("S3_BUCKET")
3 changes: 2 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ ua-parser
sqlite-utils
git+https://github.com/kanedata/[email protected]
certifi
xlsxwriter
xlsxwriter
boto3
Loading

0 comments on commit 4f966f6

Please sign in to comment.