2895: New cmd for paginated census migration (#2905)
* Creating paginated command

* Updating readme

* Lint

* Using django paginator

* Using page_size instead of batchSize

* Readme tweak

* Readme tweak

* Update backend/census_historical_migration/README.md

---------

Co-authored-by: Phil Dominguez <“[email protected]”>
Co-authored-by: Hassan D. M. Sambo <[email protected]>
3 people authored Dec 1, 2023
1 parent de7c759 commit 5e36aea
Showing 3 changed files with 88 additions and 24 deletions.
14 changes: 12 additions & 2 deletions backend/census_historical_migration/README.md
@@ -41,7 +41,7 @@ python manage.py csv_to_postgres --clean True

## How to load test Census data into Postgres

1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into the census_historical_migration/data folder.
NOTE: Never check the census_historical_migration/data folder into GitHub.

2. In the FAC/backend folder, run the following to load CSV files from the census_historical_migration/data folder into the fac-census-to-gsafac-s3 bucket.
Expand All @@ -55,13 +55,23 @@ docker compose run --rm web python manage.py csv_to_postgres --folder data --chu
```

### How to run the historic data migrator:
To migrate individual dbkeys:
```
docker compose run --rm web python manage.py historic_data_migrator \
  --years 22 \
  --dbkeys 177310
```
- `years` and `dbkeys` are optional. The script will use default values for these if they aren't provided.

To migrate dbkeys for a given year with pagination:
```
docker compose run --rm web python manage.py run_paginated_migration \
  --year 2022 \
  --page_size 1000 \
  --pages 1,3,4
```
- `page_size` and `pages` are optional. The script will use default values for these if they aren't provided.
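As a rough illustration of the page numbering the command expects (plain Python, standing in for the Django `Paginator` the command actually uses): pages are 1-indexed, each holds at most `page_size` items, and the final page may be short. The DBKEY values below are made up.

```python
from math import ceil


def paginate(items, page_size):
    """Split items into 1-indexed pages of at most page_size each,
    mirroring the numbering scheme Django's Paginator uses."""
    num_pages = ceil(len(items) / page_size)
    return {
        n: items[(n - 1) * page_size : n * page_size]
        for n in range(1, num_pages + 1)
    }


dbkeys = list(range(100001, 100011))  # ten made-up DBKEYs
pages = paginate(dbkeys, 4)
print(len(pages))  # 3
print(pages[3])    # [100009, 100010]
```

So `--page_size 1000 --pages 1,3` would process the first and third blocks of 1000 submissions, skipping the second.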

### How to run the historic workbook generator:
```
docker compose run --rm web python manage.py historic_workbook_generator \
@@ -74,6 +84,6 @@ docker compose run --rm web python manage.py historic_workbook_generator \

### How to trigger historic data migrator from GitHub:
- Go to GitHub Actions and select `Historic data migrator` action
- Next, click on `Run workflow` at the top right
- Provide the target `environment` along with optional parameters such as `dbkeys` and `years`
- Click `Run`
57 changes: 35 additions & 22 deletions backend/census_historical_migration/historic_data_loader.py
@@ -2,36 +2,49 @@
from .workbooklib.end_to_end_core import run_end_to_end

from django.contrib.auth import get_user_model
from django.core.paginator import Paginator


User = get_user_model()


def load_historic_data_for_year(audit_year):
def load_historic_data_for_year(audit_year, page_size, pages):
    """Iterates over and processes submissions for the given audit year"""
    result_log = {}
    total_count = error_count = 0
    user = create_or_get_user()
    submissions_for_year = Gen.objects.filter(AUDITYEAR=audit_year)

    for submission in submissions_for_year:
        dbkey = submission.DBKEY
        result = {"success": [], "errors": []}

        try:
            # Migrate a single submission
            run_end_to_end(user, dbkey, audit_year, result)
        except Exception as exc:
            result["errors"].append(f"{exc}")

        result_log[(audit_year, dbkey)] = result
        total_count += 1

        if len(result["errors"]) > 0:
            error_count += 1
        if total_count % 5 == 0:
            print(f"Processed = {total_count}, Errors = {error_count}")
        if error_count > 5:
            break
    submissions_for_year = Gen.objects.filter(AUDITYEAR=audit_year).order_by(
        "ELECAUDITHEADERID"
    )
    paginator = Paginator(submissions_for_year, page_size)

    print(f"{submissions_for_year.count()} submissions found for {audit_year}")

    for page_number in pages:
        page = paginator.page(page_number)
        print(
            f"Processing page {page_number} with {page.object_list.count()} submissions."
        )

        for submission in page.object_list:
            dbkey = submission.DBKEY
            result = {"success": [], "errors": []}

            try:
                # Migrate a single submission
                run_end_to_end(user, dbkey, audit_year, result)
            except Exception as exc:
                result["errors"].append(f"{exc}")

            result_log[(audit_year, dbkey)] = result
            total_count += 1

            if len(result["errors"]) > 0:
                error_count += 1
            if total_count % 5 == 0:
                print(f"Processed = {total_count}, Errors = {error_count}")
            if error_count > 5:
                break

    print("********* Loader Summary ***************")

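The loader's progress and abort accounting can be sketched standalone (a hypothetical, dependency-free rendering; the counters and thresholds come from the code above, the function and variable names are invented here): a status line every five submissions, and a hard stop once more than five have errored.

```python
def run_with_abort(outcomes, progress_every=5, error_limit=5):
    """Count processed and errored items, printing progress every
    `progress_every` items and stopping once `error_limit` is exceeded."""
    total = errors = 0
    for had_error in outcomes:
        total += 1
        if had_error:
            errors += 1
        if total % progress_every == 0:
            print(f"Processed = {total}, Errors = {errors}")
        if errors > error_limit:
            break
    return total, errors


# A run where every submission fails stops after the sixth error.
print(run_with_abort([True] * 10))  # (6, 6)
```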
@@ -0,0 +1,41 @@
from ...historic_data_loader import load_historic_data_for_year

from django.core.management.base import BaseCommand

import logging
import sys


logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


class Command(BaseCommand):
    help = """
    Migrate from Census tables to GSAFAC tables for a given year using pagination
    Usage:
    manage.py run_paginated_migration
        --year <audit year>
        --page_size <page size>
        --pages <comma separated pages>
    """

    def add_arguments(self, parser):
        parser.add_argument("--year", help="4-digit Audit Year")
        parser.add_argument("--page_size", type=int, required=False, default=5)
        parser.add_argument("--pages", type=str, required=False, default="1")

    def handle(self, *args, **options):
        year = options.get("year")
        if not year:
            print("Please specify an audit year")
            return

        try:
            pages_str = options["pages"]
            pages = [int(d) for d in pages_str.split(",")]
        except ValueError:
            logger.error(f"Found a non-integer in pages '{pages_str}'")
            sys.exit(-1)

        load_historic_data_for_year(year, options["page_size"], pages)
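One detail worth noting about the `--pages` parsing in `handle` above: `int()` strips surrounding whitespace, so a value like `"1, 3, 4"` (spaces after the commas, quoted so the shell passes it as one argument) still parses, while a genuinely non-numeric entry raises `ValueError` and trips the error path. A quick standalone check:

```python
# int() tolerates whitespace around each piece after splitting on commas.
pages_str = "1, 3, 4"
pages = [int(d) for d in pages_str.split(",")]
print(pages)  # [1, 3, 4]

# A non-numeric entry fails, which is what routes the command to sys.exit.
try:
    [int(d) for d in "1,two,3".split(",")]
except ValueError as exc:
    print(f"rejected: {exc}")
```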
