diff --git a/backend/Apple_M1_Dockerfile b/backend/Apple_M1_Dockerfile index 70546c3b5f..dbad68a99e 100644 --- a/backend/Apple_M1_Dockerfile +++ b/backend/Apple_M1_Dockerfile @@ -21,10 +21,13 @@ RUN \ RUN \ apt-get update -yq && \ - apt install curl -y && \ - apt-get install -y gcc && \ - curl -fsSL https://deb.nodesource.com/setup_16.x | bash - && \ - apt-get install -y nodejs && \ + apt install build-essential curl -y && \ + apt-get install -y gcc ca-certificates gnupg && \ + mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ + NODE_MAJOR=18 && \ + echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \ + apt-get install nodejs -y && \ apt-get install -y npm && \ npm i -g npm@^8 diff --git a/backend/census_historical_migration/README.md b/backend/census_historical_migration/README.md index dc54f9e066..f23a4c79a2 100644 --- a/backend/census_historical_migration/README.md +++ b/backend/census_historical_migration/README.md @@ -1,4 +1,58 @@ -# Census Historical Migration +# Census to FAC data migration + +## Overview + +This is implemented as a Django app to leverage existing management commands and settings. It includes Python and shell scripts to: + +* Load raw census data as CSV files into an S3 bucket +* Create Postgres tables from these CSV files +* Perform any data clean up required to create a table from a CSV file +* Run the historic data migrator +* Run the historic workbook generator + +## Infrastructure changes + +* Create a new S3 bucket in Cloud.gov spaces as well as in the local environment +* Create a new Postgres instance both in CG and locally + +## Utilities + +* fac_s3.py - Uploads folders or files to an S3 bucket. 
+ +```bash +python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_historical_migration/data +``` + +* csv_to_postgres.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimiters for empty fields at the end of a line, so we assume these are nulls. + +```bash +python manage.py csv_to_postgres --folder data --chunksize 10000 +python manage.py csv_to_postgres --clean True +``` + +* models.py - These correspond to the incoming CSV files. +* routers.py - This tells Django to use a different Postgres instance. + +* data - A folder that contains sample data that we can use for development. + +## Prerequisites + +* A Django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc. to simulate a real submission + +## How to load test Census data into Postgres + +1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into the census_historical_migration/data folder. +NOTE: Never commit the census_historical_migration/data folder to GitHub. + +2. In the FAC/backend folder, run the following to load CSV files from the census_historical_migration/data folder into the fac-census-to-gsafac-s3 bucket. +```bash +docker compose run web python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_historical_migration/data +``` + +3. In the FAC/backend folder, run the following to read the CSV files from the fac-census-to-gsafac-s3 bucket and load them into Postgres. 
# backend/census_historical_migration/management/commands/csv_to_postgres.py
import logging
import boto3
import pandas as pd


from io import BytesIO
from botocore.exceptions import ClientError
from django.core.management.base import BaseCommand
from django.conf import settings
from django.apps import apps

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
census_to_gsafac_models = list(
    apps.get_app_config("census_historical_migration").get_models()
)
census_to_gsafac_model_names = [m._meta.model_name for m in census_to_gsafac_models]
s3_client = boto3.client(
    "s3",
    aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID,
    aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY,
    endpoint_url=settings.AWS_S3_ENDPOINT_URL,
)
census_to_gsafac_bucket_name = settings.AWS_CENSUS_TO_GSAFAC_BUCKET_NAME
DELIMITER = ","


class Command(BaseCommand):
    """Load CSV files from the census-to-gsafac S3 bucket into Postgres.

    Each CSV object under the given S3 prefix is matched to a model of the
    ``census_historical_migration`` app by file-name prefix and its rows are
    inserted into that model's table. ``--clean`` deletes all rows from every
    model instead, and ``--sample`` prints the first row of every model.
    """

    help = """
        Populate Postgres database from csv files
        Usage:
        manage.py csv_to_postgres --folder <folder_name> [--chunksize <rows>]
        manage.py csv_to_postgres --clean True
        manage.py csv_to_postgres --sample True
    """

    # Rows per INSERT issued by bulk_create. Kept small because the widest
    # model has roughly a hundred columns and each value is a query parameter.
    INSERT_BATCH_SIZE = 500

    @staticmethod
    def _str_to_bool(value):
        """Parse a command-line flag value such as "True"/"False" into a bool.

        ``type=bool`` must not be used with argparse: ``bool("False")`` is
        ``True`` because any non-empty string is truthy.

        Raises:
            ValueError: if the text is not a recognised boolean spelling
                (argparse turns this into a usage error).
        """
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y"):
            return True
        if text in ("0", "false", "f", "no", "n", ""):
            return False
        raise ValueError(f"expected a boolean value, got {value!r}")

    def add_arguments(self, parser):
        """Register the command-line options on *parser*."""
        parser.add_argument("--folder", help="S3 folder name (required)", type=str)
        parser.add_argument(
            "--clean",
            help="Clean the data (default: False)",
            type=self._str_to_bool,
            default=False,
        )
        parser.add_argument(
            "--sample",
            help="Sample the data (default: False)",
            type=self._str_to_bool,
            default=False,
        )
        # Accepted for backward compatibility; not read by handle().
        parser.add_argument("--load")
        parser.add_argument(
            "--chunksize",
            help="Chunk size for processing data (default: 10_000)",
            type=int,
            default=10_000,
        )

    def handle(self, *args, **options):
        """Dispatch to clean, sample, or load depending on the options."""
        # --clean and --sample do not touch S3, so they must not require
        # --folder (the README documents `csv_to_postgres --clean True`).
        if options.get("clean"):
            self.delete_data()
            return
        if options.get("sample"):
            self.sample_data()
            return
        folder = options.get("folder")
        if not folder:
            print("Please specify a folder name")
            return
        self.process_csv_files(folder, options.get("chunksize"))

    def process_csv_files(self, folder, chunk_size):
        """Load every CSV object under *folder* into its matching model.

        Objects whose key ends in "/" (folder placeholders) and objects whose
        name matches no model are skipped. Row counts for all models are
        printed at the end.
        """
        models_by_name = dict(
            zip(census_to_gsafac_model_names, census_to_gsafac_models)
        )
        for item in self.list_s3_objects(census_to_gsafac_bucket_name, folder):
            key = item["Key"]
            if key.endswith("/"):
                continue
            model_name = self.get_model_name(key)
            if not model_name:
                continue
            model_obj = models_by_name[model_name]
            file = self.get_s3_object(census_to_gsafac_bucket_name, key, model_obj)
            if file is not None:
                self.load_data(file, model_obj, chunk_size)

        self.display_row_counts(census_to_gsafac_models)

    def display_row_counts(self, models):
        """Print the number of rows currently stored for each model."""
        for mdl in models:
            row_count = mdl.objects.all().count()
            print(f"{row_count} in ", mdl)

    def delete_data(self):
        """Delete all rows from every model of the app."""
        for mdl in census_to_gsafac_models:
            print("Deleting ", mdl)
            mdl.objects.all().delete()

    def sample_data(self):
        """Print every field of the first row of each model, if any."""
        for mdl in census_to_gsafac_models:
            print("Sampling ", mdl)
            rows = mdl.objects.all()[:1]
            for row in rows:
                for col in mdl._meta.fields:
                    print(f"{col.name}: {getattr(row, col.name)}")

    def list_s3_objects(self, bucket_name, folder):
        """Return the object summaries under the *folder* prefix.

        Uses a paginator so that prefixes holding more objects than a single
        list call returns are fully enumerated, and returns an empty list
        (instead of raising ``KeyError``) when nothing matches the prefix.
        """
        paginator = s3_client.get_paginator("list_objects_v2")
        contents = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder):
            # "Contents" is absent from a page that holds no objects.
            contents.extend(page.get("Contents", []))
        return contents

    def get_s3_object(self, bucket_name, key, model_obj):
        """Download *key* into memory.

        Returns:
            A ``BytesIO`` holding the object's bytes, or ``None`` if the
            download failed (the failure is logged).
        """
        file = BytesIO()
        try:
            s3_client.download_fileobj(Bucket=bucket_name, Key=key, Fileobj=file)
        except ClientError as err:
            logger.error("Could not download %s for %s: %s", key, model_obj, err)
            return None
        print(f"Obtained {model_obj} from S3")
        return file

    def get_model_name(self, name):
        """Return the model name that the file name in *name* starts with.

        The comparison uses the lower-cased base name without extension. When
        several model names are prefixes of the file name, the longest one
        wins so that the result does not depend on model ordering. Returns
        ``None`` when no model matches.
        """
        print("Processing ", name)
        file_name = name.split("/")[-1].split(".")[0].lower()
        matches = [
            model_name
            for model_name in census_to_gsafac_model_names
            if file_name.startswith(model_name)
        ]
        if matches:
            model_name = max(matches, key=len)
            print("model_name = ", model_name)
            return model_name
        print("Could not find a matching model for ", name)
        return None

    def load_data(self, file, model_obj, chunk_size):
        """Insert the rows of the CSV in *file* into *model_obj*'s table.

        The CSV header row must carry the model's field names. All values are
        read as text, because every model column is a ``TextField``; letting
        pandas infer numeric types would drop leading zeros and turn integer
        columns containing blanks into floats. Empty or missing cells are
        stored as NULL.
        """
        print("Starting load data to postgres")
        file.seek(0)
        rows_loaded = 0
        try:
            reader = pd.read_csv(
                file,
                sep=DELIMITER,
                iterator=True,
                chunksize=chunk_size,
                dtype=str,
                # Only a truly empty cell is a null; literal text such as
                # "NA" or "NULL" is kept as data.
                keep_default_na=False,
                na_values=[""],
            )
        except pd.errors.EmptyDataError:
            logger.warning("No data found for %s; nothing loaded", model_obj)
            return None
        for df in reader:
            objs = [
                model_obj(
                    **{
                        col: (None if pd.isna(val) else val)
                        for col, val in record.items()
                    }
                )
                for record in df.to_dict(orient="records")
            ]
            # One batched INSERT per slice instead of one query per row.
            model_obj.objects.bulk_create(objs, batch_size=self.INSERT_BATCH_SIZE)
            rows_loaded += len(objs)
            print(f"Loaded {rows_loaded} rows in ", model_obj)
        return None
# backend/census_historical_migration/migrations/0001_initial.py
# Generated by Django 4.2.6 on 2023-11-07 18:46

from django.db import migrations, models


class Migration(migrations.Migration):
    """Initial schema for the census_historical_migration app.

    Creates one table per model (ELECAUDITFINDINGS, ELECAUDITHEADER,
    ELECAUDITS, ELECCAPTEXT, ELECCPAS, ELECEINS, ELECFINDINGSTEXT, ELECNOTES,
    ELECPASSTHROUGH, ELECUEIS). Every table gets an auto-created ``id``
    BigAutoField primary key; every other column is a nullable, blankable
    TextField.
    """

    initial = True

    # First migration of the app: it depends on no other migration.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="ELECAUDITFINDINGS",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ELECAUDITFINDINGSID", models.TextField(blank=True, null=True)),
                ("ELECAUDITSID", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("REPORTID", models.TextField(blank=True, null=True)),
                ("VERSION", models.TextField(blank=True, null=True)),
                ("QCOSTS", models.TextField(blank=True, null=True)),
                ("OTHERFINDINGS", models.TextField(blank=True, null=True)),
                ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)),
                ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)),
                ("OTHERNONCOMPLIANCE", models.TextField(blank=True, null=True)),
                ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)),
                ("FINDINGREFNUMS", models.TextField(blank=True, null=True)),
                ("MODIFIEDOPINION", models.TextField(blank=True, null=True)),
                ("REPEATFINDING", models.TextField(blank=True, null=True)),
                ("PRIORFINDINGREFNUMS", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECAUDITHEADER",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ELECAUDITHEADERID", models.TextField(blank=True, null=True)),
                # Upper-case "ID" is a source column, distinct from the
                # auto-created lower-case "id" primary key above.
                ("ID", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("FYENDDATE", models.TextField(blank=True, null=True)),
                ("AUDITTYPE", models.TextField(blank=True, null=True)),
                ("PERIODCOVERED", models.TextField(blank=True, null=True)),
                ("NUMBERMONTHS", models.TextField(blank=True, null=True)),
                ("MULTIPLEEINS", models.TextField(blank=True, null=True)),
                ("EIN", models.TextField(blank=True, null=True)),
                ("EINSUBCODE", models.TextField(blank=True, null=True)),
                ("MULTIPLEDUNS", models.TextField(blank=True, null=True)),
                ("DUNS", models.TextField(blank=True, null=True)),
                ("AUDITEENAME", models.TextField(blank=True, null=True)),
                ("STREET1", models.TextField(blank=True, null=True)),
                ("STREET2", models.TextField(blank=True, null=True)),
                ("CITY", models.TextField(blank=True, null=True)),
                ("STATE", models.TextField(blank=True, null=True)),
                ("ZIPCODE", models.TextField(blank=True, null=True)),
                ("AUDITEECONTACT", models.TextField(blank=True, null=True)),
                ("AUDITEETITLE", models.TextField(blank=True, null=True)),
                ("AUDITEEPHONE", models.TextField(blank=True, null=True)),
                ("AUDITEEFAX", models.TextField(blank=True, null=True)),
                ("AUDITEEEMAIL", models.TextField(blank=True, null=True)),
                ("AUDITEEDATESIGNED", models.TextField(blank=True, null=True)),
                ("AUDITEENAMETITLE", models.TextField(blank=True, null=True)),
                ("CPAFIRMNAME", models.TextField(blank=True, null=True)),
                ("CPASTREET1", models.TextField(blank=True, null=True)),
                ("CPASTREET2", models.TextField(blank=True, null=True)),
                ("CPACITY", models.TextField(blank=True, null=True)),
                ("CPASTATE", models.TextField(blank=True, null=True)),
                ("CPAZIPCODE", models.TextField(blank=True, null=True)),
                ("CPACONTACT", models.TextField(blank=True, null=True)),
                ("CPATITLE", models.TextField(blank=True, null=True)),
                ("CPAPHONE", models.TextField(blank=True, null=True)),
                ("CPAFAX", models.TextField(blank=True, null=True)),
                ("CPAEMAIL", models.TextField(blank=True, null=True)),
                ("CPADATESIGNED", models.TextField(blank=True, null=True)),
                ("CPANAMETITLE", models.TextField(blank=True, null=True)),
                ("COG_OVER", models.TextField(blank=True, null=True)),
                ("COGAGENCY", models.TextField(blank=True, null=True)),
                ("TYPEREPORT_FS", models.TextField(blank=True, null=True)),
                ("REPORTABLECONDITION", models.TextField(blank=True, null=True)),
                ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)),
                ("MATERIALNONCOMPLIANCE", models.TextField(blank=True, null=True)),
                ("GOINGCONCERN", models.TextField(blank=True, null=True)),
                ("TYPEREPORT_MP", models.TextField(blank=True, null=True)),
                ("DOLLARTHRESHOLD", models.TextField(blank=True, null=True)),
                ("LOWRISK", models.TextField(blank=True, null=True)),
                ("REPORTREQUIRED", models.TextField(blank=True, null=True)),
                ("TOTFEDEXPEND", models.TextField(blank=True, null=True)),
                ("COPIES", models.TextField(blank=True, null=True)),
                ("REPORTABLECONDITION_MP", models.TextField(blank=True, null=True)),
                ("MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)),
                ("QCOSTS", models.TextField(blank=True, null=True)),
                ("CYFINDINGS", models.TextField(blank=True, null=True)),
                ("PYSCHEDULE", models.TextField(blank=True, null=True)),
                ("DUP_REPORTS", models.TextField(blank=True, null=True)),
                ("COG_AGENCY", models.TextField(blank=True, null=True)),
                ("OVERSIGHTAGENCY", models.TextField(blank=True, null=True)),
                ("DATERECEIVED", models.TextField(blank=True, null=True)),
                ("DATEFIREWALL", models.TextField(blank=True, null=True)),
                ("PREVIOUSDATEFIREWALL", models.TextField(blank=True, null=True)),
                ("FINDINGREFNUM", models.TextField(blank=True, null=True)),
                ("TYPEOFENTITY", models.TextField(blank=True, null=True)),
                ("IMAGE", models.TextField(blank=True, null=True)),
                ("AGENCYCFDA", models.TextField(blank=True, null=True)),
                ("INITIALDATE", models.TextField(blank=True, null=True)),
                ("DATERECEIVEDOTHER", models.TextField(blank=True, null=True)),
                ("MULTIPLE_CPAS", models.TextField(blank=True, null=True)),
                ("AUDITEECERTIFYNAME", models.TextField(blank=True, null=True)),
                ("AUDITEECERTIFYTITLE", models.TextField(blank=True, null=True)),
                ("FACACCEPTEDDATE", models.TextField(blank=True, null=True)),
                ("AUDITOR_EIN", models.TextField(blank=True, null=True)),
                ("SD_MATERIALWEAKNESS", models.TextField(blank=True, null=True)),
                ("SD_MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)),
                ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)),
                ("SIGNIFICANTDEFICIENCY_MP", models.TextField(blank=True, null=True)),
                ("SP_FRAMEWORK", models.TextField(blank=True, null=True)),
                ("SP_FRAMEWORK_REQUIRED", models.TextField(blank=True, null=True)),
                ("TYPEREPORT_SP_FRAMEWORK", models.TextField(blank=True, null=True)),
                ("SUPPRESSION_CODE", models.TextField(blank=True, null=True)),
                ("ENTITY_TYPE", models.TextField(blank=True, null=True)),
                ("TYPEAUDIT_CODE", models.TextField(blank=True, null=True)),
                ("OPEID", models.TextField(blank=True, null=True)),
                ("DATETOED", models.TextField(blank=True, null=True)),
                ("DATEFINISHED", models.TextField(blank=True, null=True)),
                ("TYPEFINDING", models.TextField(blank=True, null=True)),
                ("TYPEFUNDING", models.TextField(blank=True, null=True)),
                ("FYSTARTDATE", models.TextField(blank=True, null=True)),
                ("CPAFOREIGN", models.TextField(blank=True, null=True)),
                ("UEI", models.TextField(blank=True, null=True)),
                ("MULTIPLEUEIS", models.TextField(blank=True, null=True)),
                ("CPACOUNTRY", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECAUDITS",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ELECAUDITSID", models.TextField(blank=True, null=True)),
                ("ID", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("CFDASEQNUM", models.TextField(blank=True, null=True)),
                ("CFDA", models.TextField(blank=True, null=True)),
                ("FEDERALPROGRAMNAME", models.TextField(blank=True, null=True)),
                ("AMOUNT", models.TextField(blank=True, null=True)),
                ("MAJORPROGRAM", models.TextField(blank=True, null=True)),
                ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)),
                ("QCOSTS2", models.TextField(blank=True, null=True)),
                ("FINDINGS", models.TextField(blank=True, null=True)),
                ("FINDINGREFNUMS", models.TextField(blank=True, null=True)),
                ("RD", models.TextField(blank=True, null=True)),
                ("DIRECT", models.TextField(blank=True, null=True)),
                ("CFDA_PREFIX", models.TextField(blank=True, null=True)),
                ("CFDA_EXT", models.TextField(blank=True, null=True)),
                ("EIN", models.TextField(blank=True, null=True)),
                ("CFDA2", models.TextField(blank=True, null=True)),
                ("TYPEREPORT_MP", models.TextField(blank=True, null=True)),
                ("TYPEREPORT_MP_OVERRIDE", models.TextField(blank=True, null=True)),
                ("ARRA", models.TextField(blank=True, null=True)),
                ("LOANS", models.TextField(blank=True, null=True)),
                ("FINDINGSCOUNT", models.TextField(blank=True, null=True)),
                ("LOANBALANCE", models.TextField(blank=True, null=True)),
                ("PASSTHROUGHAMOUNT", models.TextField(blank=True, null=True)),
                ("AWARDIDENTIFICATION", models.TextField(blank=True, null=True)),
                ("CLUSTERNAME", models.TextField(blank=True, null=True)),
                ("PASSTHROUGHAWARD", models.TextField(blank=True, null=True)),
                ("STATECLUSTERNAME", models.TextField(blank=True, null=True)),
                ("PROGRAMTOTAL", models.TextField(blank=True, null=True)),
                ("CLUSTERTOTAL", models.TextField(blank=True, null=True)),
                ("OTHERCLUSTERNAME", models.TextField(blank=True, null=True)),
                ("CFDAPROGRAMNAME", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECCAPTEXT",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("SEQ_NUMBER", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("FINDINGREFNUMS", models.TextField(blank=True, null=True)),
                ("TEXT", models.TextField(blank=True, null=True)),
                ("CHARTSTABLES", models.TextField(blank=True, null=True)),
                ("REPORTID", models.TextField(blank=True, null=True)),
                ("VERSION", models.TextField(blank=True, null=True)),
                ("UEI", models.TextField(blank=True, null=True)),
                ("MULTIPLEUEIS", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECCPAS",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ID", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("SEQNUM", models.TextField(blank=True, null=True)),
                ("VERSION", models.TextField(blank=True, null=True)),
                ("CPAFIRMNAME", models.TextField(blank=True, null=True)),
                ("CPASTREET1", models.TextField(blank=True, null=True)),
                ("CPACITY", models.TextField(blank=True, null=True)),
                ("CPASTATE", models.TextField(blank=True, null=True)),
                ("CPAZIPCODE", models.TextField(blank=True, null=True)),
                ("CPACONTACT", models.TextField(blank=True, null=True)),
                ("CPATITLE", models.TextField(blank=True, null=True)),
                ("CPAPHONE", models.TextField(blank=True, null=True)),
                ("CPAFAX", models.TextField(blank=True, null=True)),
                ("CPAEMAIL", models.TextField(blank=True, null=True)),
                ("CPAEIN", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECEINS",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ID", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("EIN", models.TextField(blank=True, null=True)),
                ("EINSEQNUM", models.TextField(blank=True, null=True)),
                ("DUNS", models.TextField(blank=True, null=True)),
                ("DUNSEQNUM", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECFINDINGSTEXT",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("SEQ_NUMBER", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("FINDINGREFNUMS", models.TextField(blank=True, null=True)),
                ("TEXT", models.TextField(blank=True, null=True)),
                ("CHARTSTABLES", models.TextField(blank=True, null=True)),
                ("REPORTID", models.TextField(blank=True, null=True)),
                ("VERSION", models.TextField(blank=True, null=True)),
                ("UEI", models.TextField(blank=True, null=True)),
                ("MULTIPLEUEIS", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECNOTES",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ID", models.TextField(blank=True, null=True)),
                ("REPORTID", models.TextField(blank=True, null=True)),
                ("VERSION", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("SEQ_NUMBER", models.TextField(blank=True, null=True)),
                ("TYPE_ID", models.TextField(blank=True, null=True)),
                ("NOTE_INDEX", models.TextField(blank=True, null=True)),
                ("TITLE", models.TextField(blank=True, null=True)),
                ("CONTENT", models.TextField(blank=True, null=True)),
                ("UEI", models.TextField(blank=True, null=True)),
                ("MULTIPLEUEIS", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECPASSTHROUGH",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("ID", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("ELECAUDITSID", models.TextField(blank=True, null=True)),
                ("PASSTHROUGHNAME", models.TextField(blank=True, null=True)),
                ("PASSTHROUGHID", models.TextField(blank=True, null=True)),
            ],
        ),
        migrations.CreateModel(
            name="ELECUEIS",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("UEISID", models.TextField(blank=True, null=True)),
                ("REPORTID", models.TextField(blank=True, null=True)),
                ("VERSION", models.TextField(blank=True, null=True)),
                ("DBKEY", models.TextField(blank=True, null=True)),
                ("AUDITYEAR", models.TextField(blank=True, null=True)),
                ("UEI", models.TextField(blank=True, null=True)),
                ("SEQNUM", models.TextField(blank=True, null=True)),
            ],
        ),
    ]
# backend/census_historical_migration/models.py
from django.db import models

# Each model below mirrors one incoming Census CSV file: the csv_to_postgres
# command matches a file to a model by file-name prefix and passes the CSV
# column names straight through as field names, which is why the fields keep
# the upper-case source spelling. Every column is a nullable, blankable
# TextField; no typing or validation is applied at this layer.
# NOTE(review): the meaning of individual columns is defined by the Census
# source data and is not documented here - confirm against the source schema.


class ELECAUDITHEADER(models.Model):
    """Rows loaded from the ELECAUDITHEADER CSV file; all columns are text."""

    ELECAUDITHEADERID = models.TextField(blank=True, null=True)
    # Source column named "ID"; distinct from Django's implicit "id" key.
    ID = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    FYENDDATE = models.TextField(blank=True, null=True)
    AUDITTYPE = models.TextField(blank=True, null=True)
    PERIODCOVERED = models.TextField(blank=True, null=True)
    NUMBERMONTHS = models.TextField(blank=True, null=True)
    MULTIPLEEINS = models.TextField(blank=True, null=True)
    EIN = models.TextField(blank=True, null=True)
    EINSUBCODE = models.TextField(blank=True, null=True)
    MULTIPLEDUNS = models.TextField(blank=True, null=True)
    DUNS = models.TextField(blank=True, null=True)
    AUDITEENAME = models.TextField(blank=True, null=True)
    STREET1 = models.TextField(blank=True, null=True)
    STREET2 = models.TextField(blank=True, null=True)
    CITY = models.TextField(blank=True, null=True)
    STATE = models.TextField(blank=True, null=True)
    ZIPCODE = models.TextField(blank=True, null=True)
    AUDITEECONTACT = models.TextField(blank=True, null=True)
    AUDITEETITLE = models.TextField(blank=True, null=True)
    AUDITEEPHONE = models.TextField(blank=True, null=True)
    AUDITEEFAX = models.TextField(blank=True, null=True)
    AUDITEEEMAIL = models.TextField(blank=True, null=True)
    AUDITEEDATESIGNED = models.TextField(blank=True, null=True)
    AUDITEENAMETITLE = models.TextField(blank=True, null=True)
    CPAFIRMNAME = models.TextField(blank=True, null=True)
    CPASTREET1 = models.TextField(blank=True, null=True)
    CPASTREET2 = models.TextField(blank=True, null=True)
    CPACITY = models.TextField(blank=True, null=True)
    CPASTATE = models.TextField(blank=True, null=True)
    CPAZIPCODE = models.TextField(blank=True, null=True)
    CPACONTACT = models.TextField(blank=True, null=True)
    CPATITLE = models.TextField(blank=True, null=True)
    CPAPHONE = models.TextField(blank=True, null=True)
    CPAFAX = models.TextField(blank=True, null=True)
    CPAEMAIL = models.TextField(blank=True, null=True)
    CPADATESIGNED = models.TextField(blank=True, null=True)
    CPANAMETITLE = models.TextField(blank=True, null=True)
    COG_OVER = models.TextField(blank=True, null=True)
    COGAGENCY = models.TextField(blank=True, null=True)
    TYPEREPORT_FS = models.TextField(blank=True, null=True)
    REPORTABLECONDITION = models.TextField(blank=True, null=True)
    MATERIALWEAKNESS = models.TextField(blank=True, null=True)
    MATERIALNONCOMPLIANCE = models.TextField(blank=True, null=True)
    GOINGCONCERN = models.TextField(blank=True, null=True)
    TYPEREPORT_MP = models.TextField(blank=True, null=True)
    DOLLARTHRESHOLD = models.TextField(blank=True, null=True)
    LOWRISK = models.TextField(blank=True, null=True)
    REPORTREQUIRED = models.TextField(blank=True, null=True)
    TOTFEDEXPEND = models.TextField(blank=True, null=True)
    COPIES = models.TextField(blank=True, null=True)
    REPORTABLECONDITION_MP = models.TextField(blank=True, null=True)
    MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True)
    QCOSTS = models.TextField(blank=True, null=True)
    CYFINDINGS = models.TextField(blank=True, null=True)
    PYSCHEDULE = models.TextField(blank=True, null=True)
    DUP_REPORTS = models.TextField(blank=True, null=True)
    COG_AGENCY = models.TextField(blank=True, null=True)
    OVERSIGHTAGENCY = models.TextField(blank=True, null=True)
    DATERECEIVED = models.TextField(blank=True, null=True)
    DATEFIREWALL = models.TextField(blank=True, null=True)
    PREVIOUSDATEFIREWALL = models.TextField(blank=True, null=True)
    FINDINGREFNUM = models.TextField(blank=True, null=True)
    TYPEOFENTITY = models.TextField(blank=True, null=True)
    IMAGE = models.TextField(blank=True, null=True)
    AGENCYCFDA = models.TextField(blank=True, null=True)
    INITIALDATE = models.TextField(blank=True, null=True)
    DATERECEIVEDOTHER = models.TextField(blank=True, null=True)
    MULTIPLE_CPAS = models.TextField(blank=True, null=True)
    AUDITEECERTIFYNAME = models.TextField(blank=True, null=True)
    AUDITEECERTIFYTITLE = models.TextField(blank=True, null=True)
    FACACCEPTEDDATE = models.TextField(blank=True, null=True)
    AUDITOR_EIN = models.TextField(blank=True, null=True)
    SD_MATERIALWEAKNESS = models.TextField(blank=True, null=True)
    SD_MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True)
    SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True)
    SIGNIFICANTDEFICIENCY_MP = models.TextField(blank=True, null=True)
    SP_FRAMEWORK = models.TextField(blank=True, null=True)
    SP_FRAMEWORK_REQUIRED = models.TextField(blank=True, null=True)
    TYPEREPORT_SP_FRAMEWORK = models.TextField(blank=True, null=True)
    SUPPRESSION_CODE = models.TextField(blank=True, null=True)
    ENTITY_TYPE = models.TextField(blank=True, null=True)
    TYPEAUDIT_CODE = models.TextField(blank=True, null=True)
    OPEID = models.TextField(blank=True, null=True)
    DATETOED = models.TextField(blank=True, null=True)
    DATEFINISHED = models.TextField(blank=True, null=True)
    TYPEFINDING = models.TextField(blank=True, null=True)
    TYPEFUNDING = models.TextField(blank=True, null=True)
    FYSTARTDATE = models.TextField(blank=True, null=True)
    CPAFOREIGN = models.TextField(blank=True, null=True)
    UEI = models.TextField(blank=True, null=True)
    MULTIPLEUEIS = models.TextField(blank=True, null=True)
    CPACOUNTRY = models.TextField(blank=True, null=True)


class ELECEINS(models.Model):
    """Rows loaded from the ELECEINS CSV file; all columns are text."""

    ID = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    EIN = models.TextField(blank=True, null=True)
    EINSEQNUM = models.TextField(blank=True, null=True)
    DUNS = models.TextField(blank=True, null=True)
    DUNSEQNUM = models.TextField(blank=True, null=True)


class ELECAUDITFINDINGS(models.Model):
    """Rows loaded from the ELECAUDITFINDINGS CSV file; all columns are text."""

    ELECAUDITFINDINGSID = models.TextField(blank=True, null=True)
    ELECAUDITSID = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    REPORTID = models.TextField(blank=True, null=True)
    VERSION = models.TextField(blank=True, null=True)
    QCOSTS = models.TextField(blank=True, null=True)
    OTHERFINDINGS = models.TextField(blank=True, null=True)
    SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True)
    MATERIALWEAKNESS = models.TextField(blank=True, null=True)
    OTHERNONCOMPLIANCE = models.TextField(blank=True, null=True)
    TYPEREQUIREMENT = models.TextField(blank=True, null=True)
    FINDINGREFNUMS = models.TextField(blank=True, null=True)
    MODIFIEDOPINION = models.TextField(blank=True, null=True)
    REPEATFINDING = models.TextField(blank=True, null=True)
    PRIORFINDINGREFNUMS = models.TextField(blank=True, null=True)


class ELECNOTES(models.Model):
    """Rows loaded from the ELECNOTES CSV file; all columns are text."""

    ID = models.TextField(blank=True, null=True)
    REPORTID = models.TextField(blank=True, null=True)
    VERSION = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    SEQ_NUMBER = models.TextField(blank=True, null=True)
    TYPE_ID = models.TextField(blank=True, null=True)
    NOTE_INDEX = models.TextField(blank=True, null=True)
    TITLE = models.TextField(blank=True, null=True)
    CONTENT = models.TextField(blank=True, null=True)
    UEI = models.TextField(blank=True, null=True)
    MULTIPLEUEIS = models.TextField(blank=True, null=True)


class ELECFINDINGSTEXT(models.Model):
    """Rows loaded from the ELECFINDINGSTEXT CSV file; all columns are text."""

    SEQ_NUMBER = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    FINDINGREFNUMS = models.TextField(blank=True, null=True)
    TEXT = models.TextField(blank=True, null=True)
    CHARTSTABLES = models.TextField(blank=True, null=True)
    REPORTID = models.TextField(blank=True, null=True)
    VERSION = models.TextField(blank=True, null=True)
    UEI = models.TextField(blank=True, null=True)
    MULTIPLEUEIS = models.TextField(blank=True, null=True)


class ELECCPAS(models.Model):
    """Rows loaded from the ELECCPAS CSV file; all columns are text."""

    ID = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    SEQNUM = models.TextField(blank=True, null=True)
    VERSION = models.TextField(blank=True, null=True)
    CPAFIRMNAME = models.TextField(blank=True, null=True)
    CPASTREET1 = models.TextField(blank=True, null=True)
    CPACITY = models.TextField(blank=True, null=True)
    CPASTATE = models.TextField(blank=True, null=True)
    CPAZIPCODE = models.TextField(blank=True, null=True)
    CPACONTACT = models.TextField(blank=True, null=True)
    CPATITLE = models.TextField(blank=True, null=True)
    CPAPHONE = models.TextField(blank=True, null=True)
    CPAFAX = models.TextField(blank=True, null=True)
    CPAEMAIL = models.TextField(blank=True, null=True)
    CPAEIN = models.TextField(blank=True, null=True)


class ELECAUDITS(models.Model):
    """Rows loaded from the ELECAUDITS CSV file; all columns are text."""

    ELECAUDITSID = models.TextField(blank=True, null=True)
    ID = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    CFDASEQNUM = models.TextField(blank=True, null=True)
    CFDA = models.TextField(blank=True, null=True)
    FEDERALPROGRAMNAME = models.TextField(blank=True, null=True)
    AMOUNT = models.TextField(blank=True, null=True)
    MAJORPROGRAM = models.TextField(blank=True, null=True)
    TYPEREQUIREMENT = models.TextField(blank=True, null=True)
    QCOSTS2 = models.TextField(blank=True, null=True)
    FINDINGS = models.TextField(blank=True, null=True)
    FINDINGREFNUMS = models.TextField(blank=True, null=True)
    RD = models.TextField(blank=True, null=True)
    DIRECT = models.TextField(blank=True, null=True)
    CFDA_PREFIX = models.TextField(blank=True, null=True)
    CFDA_EXT = models.TextField(blank=True, null=True)
    EIN = models.TextField(blank=True, null=True)
    CFDA2 = models.TextField(blank=True, null=True)
    TYPEREPORT_MP = models.TextField(blank=True, null=True)
    TYPEREPORT_MP_OVERRIDE = models.TextField(blank=True, null=True)
    ARRA = models.TextField(blank=True, null=True)
    LOANS = models.TextField(blank=True, null=True)
    FINDINGSCOUNT = models.TextField(blank=True, null=True)
    LOANBALANCE = models.TextField(blank=True, null=True)
    PASSTHROUGHAMOUNT = models.TextField(blank=True, null=True)
    AWARDIDENTIFICATION = models.TextField(blank=True, null=True)
    CLUSTERNAME = models.TextField(blank=True, null=True)
    PASSTHROUGHAWARD = models.TextField(blank=True, null=True)
    STATECLUSTERNAME = models.TextField(blank=True, null=True)
    PROGRAMTOTAL = models.TextField(blank=True, null=True)
    CLUSTERTOTAL = models.TextField(blank=True, null=True)
    OTHERCLUSTERNAME = models.TextField(blank=True, null=True)
    CFDAPROGRAMNAME = models.TextField(blank=True, null=True)


class ELECPASSTHROUGH(models.Model):
    """Rows loaded from the ELECPASSTHROUGH CSV file; all columns are text."""

    ID = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    ELECAUDITSID = models.TextField(blank=True, null=True)
    PASSTHROUGHNAME = models.TextField(blank=True, null=True)
    PASSTHROUGHID = models.TextField(blank=True, null=True)


class ELECUEIS(models.Model):
    UEISID = models.TextField(blank=True, null=True)
    REPORTID = models.TextField(blank=True, null=True)
    VERSION = models.TextField(blank=True, null=True)
    DBKEY = models.TextField(blank=True, null=True)
    AUDITYEAR = models.TextField(blank=True, null=True)
    UEI = models.TextField(blank=True, null=True)
SEQNUM = models.TextField(blank=True, null=True)
+
+
+class ELECCAPTEXT(models.Model):
+    SEQ_NUMBER = models.TextField(blank=True, null=True)
+
+    DBKEY = models.TextField(blank=True, null=True)
+
+    AUDITYEAR = models.TextField(blank=True, null=True)
+
+    FINDINGREFNUMS = models.TextField(blank=True, null=True)
+
+    TEXT = models.TextField(blank=True, null=True)
+
+    CHARTSTABLES = models.TextField(blank=True, null=True)
+
+    REPORTID = models.TextField(blank=True, null=True)
+
+    VERSION = models.TextField(blank=True, null=True)
+
+    UEI = models.TextField(blank=True, null=True)
+
+    MULTIPLEUEIS = models.TextField(blank=True, null=True)
diff --git a/backend/census_historical_migration/routers.py b/backend/census_historical_migration/routers.py
new file mode 100644
index 0000000000..4500e28dcf
--- /dev/null
+++ b/backend/census_historical_migration/routers.py
@@ -0,0 +1,28 @@
+# App label of the models this router sends to the ``db_name`` database.
+app_name = "census_to_gsafac"
+# Alias of the database entry this change adds to config/settings.py.
+db_name = "census-to-gsafac-db"
+
+
+class DBRouter:
+    """Route models whose app label is ``app_name`` to the ``db_name`` database."""
+
+    def db_for_read(self, model, **hints):
+        """Return ``db_name`` for this app's models, else None (no opinion)."""
+        if model._meta.app_label == app_name:
+            return db_name
+        return None
+
+    def db_for_write(self, model, **hints):
+        """Route writes like reads; hints must be re-expanded as keywords."""
+        return self.db_for_read(model, **hints)
+
+    def allow_migrate(self, db, app_label, model_name=None, **hints):
+        """Keep this app's tables on ``db_name`` and every other app off it."""
+        if app_label == app_name:
+            return db == db_name
+        if db == db_name:
+            return False
+        # None (no opinion) lets other apps keep migrating on their own databases;
+        # a blanket False here would veto every other app's migrations everywhere.
+        return None
diff --git a/backend/config/settings.py b/backend/config/settings.py
index 9b5199a065..b2bc4f55ac 100644
--- a/backend/config/settings.py
+++ b/backend/config/settings.py
@@ -171,6 +171,10 @@
     "default": env.dj_db_url(
         "DATABASE_URL", default="postgres://postgres:password@0.0.0.0/backend"
     ),
+    "census-to-gsafac-db": env.dj_db_url(
+        "DATABASE_URL_CENSUS_TO_GSAFAC_DB",
+        default="postgres://postgres:password@0.0.0.0/census-to-gsafac-db",
+    ),
 }
 
 POSTGREST = {
@@ -244,6 +248,9 @@
     # Private bucket
     AWS_PRIVATE_STORAGE_BUCKET_NAME = "gsa-fac-private-s3"
 
+    # Private CENSUS_TO_GSAFAC bucket
+    AWS_CENSUS_TO_GSAFAC_BUCKET_NAME = "fac-census-to-gsafac-s3"
+
     AWS_S3_PRIVATE_REGION_NAME = os.environ.get(
         "AWS_S3_PRIVATE_REGION_NAME", "us-east-1"
     )
diff --git a/backend/docker-compose-web.yml b/backend/docker-compose-web.yml
index 757281f19b..d8e89ffd35 100644
--- a/backend/docker-compose-web.yml
+++ b/backend/docker-compose-web.yml
@@ -16,6 +16,20 @@ services:
       timeout: 5s
       retries: 10
 
+  census-to-gsafac-db:
+    image: "postgres:12"
+    environment:
+      POSTGRES_HOST_AUTH_METHOD: "trust"
+    volumes:
+      - census-to-gsafac-data:/var/lib/postgresql/data/
+    ports:
+      - "5433:5432"
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -d postgres -U postgres -p 5432"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+
   web:
     image: ghcr.io/gsa-tts/fac/web-container:latest
     command: /src/run.sh
@@ -83,4 +97,5 @@ services:
 
 volumes:
   postgres-data:
+  census-to-gsafac-data:
   minio-vol:
diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml
index b9f48eca94..b66375f91c 100644
--- a/backend/docker-compose.yml
+++ b/backend/docker-compose.yml
@@ -18,6 +18,20 @@ services:
       timeout: 5s
       retries: 10
 
+  census-to-gsafac-db:
+    image: "postgres:12"
+    environment:
+      POSTGRES_HOST_AUTH_METHOD: trust
+    volumes:
+      - census-to-gsafac-data:/var/lib/postgresql/data/
+    ports:
+      - "5433:5432"
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -d postgres -U postgres -p 5432"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+
   #---------------------------------------------
   # Historic data
   #---------------------------------------------
@@ -116,4 +130,5 @@ services:
         condition: service_healthy
 volumes:
   postgres-data:
+  census-to-gsafac-data:
   minio-vol:
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 672c354c11..72e4c856a4 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -7,7 +7,7 @@ exclude = "node_modules/"
 [tool.mypy]
 ignore_missing_imports = true
 explicit_package_bases = true
-exclude = ".venv/|audit/migrations/|dissemination/migrations"
+exclude = ".venv/|audit/migrations/|dissemination/migrations|census_historical_migration/migrations"
 
 [tool.pylint."MESSAGES CONTROL"]
 # Tadhg
2022-05-03: I added the below because I've found them unhelpful.
diff --git a/backend/run.sh b/backend/run.sh
index a22b47ef35..6e2b3e0911 100755
--- a/backend/run.sh
+++ b/backend/run.sh
@@ -14,6 +14,7 @@ if [[ "${ENV}" == "LOCAL" || "${ENV}" == "TESTING" ]]; then
     export AWS_S3_PRIVATE_ENDPOINT="http://minio:9000"
     mc alias set myminio "${AWS_S3_PRIVATE_ENDPOINT}" minioadmin minioadmin
     mc mb myminio/gsa-fac-private-s3
+    mc mb myminio/fac-census-to-gsafac-s3
     mc admin user svcacct add --access-key="${AWS_PRIVATE_ACCESS_KEY_ID}" --secret-key="${AWS_PRIVATE_SECRET_ACCESS_KEY}" myminio minioadmin
 fi;
 
diff --git a/backend/support/management/commands/fac_s3.py b/backend/support/management/commands/fac_s3.py
new file mode 100644
index 0000000000..9884d5b5f6
--- /dev/null
+++ b/backend/support/management/commands/fac_s3.py
@@ -0,0 +1,107 @@
+from os import path
+import os
+
+import boto3
+
+from django.core.management.base import BaseCommand, CommandError
+
+from django.conf import settings
+
+
+class Command(BaseCommand):
+    """List, upload, download or delete objects in a named S3 bucket."""
+
+    help = """
+    Alternative to aws s3 as the cli is not available in production.
+    Usage:
+    manage.py fac_s3 BUCKET_NAME --upload --src SRC [--tgt TGT]
+    manage.py fac_s3 BUCKET_NAME --download --src SRC [--tgt TGT]
+    manage.py fac_s3 BUCKET_NAME --rm --tgt TGT
+    manage.py fac_s3 BUCKET_NAME --ls [--tgt TGT]
+    """
+
+    def add_arguments(self, parser):
+        """Register the bucket positional plus the action and path options."""
+        parser.add_argument("bucket_name", type=str, help="The S3 bucket name.")
+        parser.add_argument("--src", help="local file name.")
+        parser.add_argument("--tgt", help="s3 file name.")
+        parser.add_argument("--ls", action="store_true", help="List all files.")
+        parser.add_argument(
+            "--upload", action="store_true", help="Copy local src to S3 tgt."
+        )
+        parser.add_argument(
+            "--download", action="store_true", help="Copy S3 tgt to local src."
+        )
+        parser.add_argument("--rm", action="store_true", help="Delete tgt.")
+
+    def handle(self, *args, **options):
+        """Run the first requested action (checked as ls, upload, download, rm).
+
+        Raises CommandError when no action is given or a required path is missing.
+        """
+        bucket_name = options["bucket_name"]
+        src_path = options["src"]
+        tgt_path = options["tgt"]
+
+        s3_client = boto3.client(
+            "s3",
+            aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID,
+            aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY,
+            endpoint_url=settings.AWS_S3_ENDPOINT_URL,
+        )
+
+        if options["ls"]:
+            self._list(s3_client, bucket_name, tgt_path)
+        elif options["upload"]:
+            if not src_path:
+                raise CommandError("--upload requires --src.")
+            self._upload(s3_client, bucket_name, src_path, tgt_path)
+        elif options["download"]:
+            if not src_path:
+                raise CommandError("--download requires --src.")
+            file_path = path.join(settings.BASE_DIR, src_path)
+            # Mirror --upload: the key defaults to the local file's base name.
+            object_name = tgt_path or path.basename(file_path)
+            s3_client.download_file(bucket_name, object_name, file_path)
+        elif options["rm"]:
+            if not tgt_path:
+                raise CommandError("--rm requires --tgt.")
+            s3_client.delete_object(Bucket=bucket_name, Key=tgt_path)
+        else:
+            raise CommandError("Specify one of --ls, --upload, --download, --rm.")
+
+    def _list(self, s3_client, bucket_name, prefix):
+        """Print key, size and last-modified time of each object under prefix."""
+        # A single list_objects call is capped per response, so follow every page.
+        pages = s3_client.get_paginator("list_objects").paginate(
+            Bucket=bucket_name, Prefix=prefix or ""
+        )
+        found = False
+        for page in pages:
+            for item in page.get("Contents", []):
+                found = True
+                print(item["Key"], item["Size"], item["LastModified"])
+        if not found:
+            print("Target is empty")
+
+    def _upload(self, s3_client, bucket_name, src_path, tgt_path):
+        """Upload one local file, or every file below a local directory."""
+        # normpath drops a trailing separator so basename() and the offset hold.
+        file_path = path.normpath(path.join(settings.BASE_DIR, src_path))
+        tgt_name = tgt_path or path.basename(file_path)
+        if path.isfile(file_path):
+            # os.walk() yields nothing for a plain file, so copy it directly.
+            s3_client.upload_file(file_path, bucket_name, tgt_name)
+            print(f"Copied {file_path} to {bucket_name} {tgt_name}.")
+            return
+        if not path.isdir(file_path):
+            raise CommandError(f"{file_path} is not a file or directory.")
+        tgt_name_offset = len(file_path)
+        for subdir, _subdirs, files in os.walk(file_path):
+            # Key prefix: target name plus the directory path relative to src.
+            relative = subdir[tgt_name_offset:].replace(os.sep, "/")
+            object_name = f"{tgt_name}{relative}/"
+            for file in files:
+                full_path = path.join(subdir, file)
+                s3_client.upload_file(full_path, bucket_name, object_name + file)
+                print(f"Copied {full_path} to {bucket_name} {object_name + file}.")