Add excel sheet filter to file ingestion efi functions
tim.reichard committed Jun 7, 2023
1 parent b259ad8 commit 8548d1e
Showing 5 changed files with 52 additions and 6 deletions.
5 changes: 5 additions & 0 deletions HISTORY.rst
@@ -3,6 +3,11 @@ History
 =======
 
 
+v0.18.3 (2023-06-07)
+
+* Add excel sheet filter to file ingestion efi functions.
+
+
 v0.18.2 (2023-04-18)
 
 * Add psycopg2 imports inside functions and not at the top of the file.
2 changes: 1 addition & 1 deletion Makefile
@@ -6,7 +6,7 @@ clean:
 
 install:
 	. env/bin/activate; \
-	pip install cython==0.29.33; \
+	pip install cython==0.29.35; \
 	pip install -r aioradio/requirements.txt
 
 setup:
40 changes: 36 additions & 4 deletions aioradio/file_ingestion.py
@@ -31,6 +31,7 @@
 from typing import Any, Dict, List, Union
 
 import cchardet as chardet
+import httpx
 import mandrill
 from openpyxl import load_workbook
 from smb.base import SharedFile
@@ -442,6 +443,7 @@ async def xlsx_to_tsv(
     s3_source_key: str,
     s3_destination_bucket: str,
     s3_destination_key: str,
+    fice: str='',
     delimiter: str='\t'
 ) -> Union[str, None]:
     """Convert xlsx file to csv/tsv file.
@@ -451,6 +453,7 @@ async def xlsx_to_tsv(
         s3_source_key (str): source xlsx file s3 key
         s3_destination_bucket (str): destination xlsx file s3 bucket
         s3_destination_key (str): destination xlsx file s3 key
+        fice (str): Institution unique identifier
         delimiter (str, optional): Delimiter. Defaults to '\t'.
 
     Returns:
@@ -460,7 +463,7 @@ async def xlsx_to_tsv(
     try:
         with NamedTemporaryFile(suffix='.xlsx') as tmp:
             await download_file(bucket=s3_source_bucket, filepath=tmp.name, s3_key=s3_source_key)
-            records, _ = xlsx_to_records(tmp)
+            records, _ = xlsx_to_records(fice, tmp)
 
             await tsv_to_s3(records, delimiter, s3_destination_bucket, s3_destination_key)
     except Exception as err:
@@ -474,6 +477,7 @@ async def zipfile_to_tsv(
     s3_source_key: str,
     s3_destination_bucket: str,
     s3_destination_key: str,
+    fice: str='',
     delimiter: str='\t'
 ) -> Union[str, None]:
     """Convert zipfile to csv/tsv file.
@@ -483,6 +487,7 @@ async def zipfile_to_tsv(
         s3_source_key (str): source zipfile s3 key
         s3_destination_bucket (str): destination zipfile s3 bucket
         s3_destination_key (str): destination zipfile s3 key
+        fice (str): Institution unique identifier
         delimiter (str, optional): Delimiter. Defaults to '\t'.
 
     Returns:
@@ -500,7 +505,7 @@ async def zipfile_to_tsv(
             for path in await unzip_file_get_filepaths(tmp.name, tmp_directory, include_extensions=extensions):
                 ext = os.path.splitext(path)[1].lower()
                 if ext == '.xlsx':
-                    records_from_path, header = xlsx_to_records(path, header)
+                    records_from_path, header = xlsx_to_records(fice, path, header)
                     records.extend(records_from_path)
                 else:
                     encoding = detect_encoding(path)
@@ -568,10 +573,11 @@ def tsv_to_records(path: str, encoding: str, delimiter: str, header: str) -> tuple:
     return records, header
 
 
-def xlsx_to_records(filepath: str, header: Union[str, None]=None) -> tuple:
+def xlsx_to_records(fice: str, filepath: str, header: Union[str, None]=None) -> tuple:
     """Load excel file to records object as list of lists.
 
     Args:
+        fice (str): Institution unique identifier
         filepath (str): Temporary Filepath
         header (Union[str, None], optional): Header. Defaults to None.
@@ -582,10 +588,14 @@ def xlsx_to_records(fice: str, filepath: str, header: Union[str, None]=None) -> tuple:
         tuple: Records as list of lists, header
     """
 
+    excel_sheet_filter = get_efi_excel_sheet_filter()
+
     records = []
     workbook = load_workbook(filepath, read_only=True)
     for sheet in workbook:
-        if sheet.title != 'hiddenSheet':
+        # Make sure excel sheet hasn't been marked to skip for particular fice
+        if fice not in excel_sheet_filter or sheet.title not in excel_sheet_filter[fice]:
+
             sheet.calculate_dimension(force=True)
 
             for idx, row in enumerate(sheet.values):
@@ -663,3 +673,25 @@ def detect_delimiter(path: str, encoding: str) -> str:
             count = char_count
 
     return delimiter
+
+
+def get_efi_excel_sheet_filter() -> dict[str, set]:
+    """Get the Excel sheet filter from the EFI api.
+
+    Returns:
+        dict[str, set]: Excel sheet filter with fice as key and sheet names as value
+    """
+
+    excel_sheet_filter = {}
+    try:
+        with httpx.Client() as client:
+            resp = client.get(url="http://efi.nrccua-app.org/filter/excel-sheet", timeout=30.0)
+            for fice, item in resp.json()["excel_sheet_filter"].items():
+                excel_sheet_filter[fice] = set()
+                for name, value in item.items():
+                    if value:
+                        excel_sheet_filter[fice].add(name)
+    except Exception:
+        pass
+
+    return excel_sheet_filter
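
For reviewers, a minimal sketch of the data flow this file's changes introduce. The payload shape is an assumption inferred from the parsing loop above; the fice '001055' and sheet name 'hiddensheet' come from the new test, while 'Sheet1' is made up for illustration:

    # Assumed response shape from the EFI excel-sheet endpoint; truthy values
    # mark sheets that ingestion should skip for that fice.
    sample_payload = {
        "excel_sheet_filter": {
            "001055": {"hiddensheet": True, "Sheet1": False}
        }
    }

    excel_sheet_filter = {}
    for fice, item in sample_payload["excel_sheet_filter"].items():
        # Keep only sheet names flagged truthy, mirroring the function's loop.
        excel_sheet_filter[fice] = {name for name, value in item.items() if value}

    # xlsx_to_records then skips any sheet whose title is in the fice's set.
    assert "hiddensheet" in excel_sheet_filter["001055"]
    assert "Sheet1" not in excel_sheet_filter["001055"]

Note the design choice: because get_efi_excel_sheet_filter swallows exceptions and returns an empty dict, a fice absent from the filter (or an unreachable EFI service) means no sheets are skipped, which preserves the old behavior for callers that pass the default fice=''.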
9 changes: 9 additions & 0 deletions aioradio/tests/file_ingestion_test.py
@@ -14,6 +14,7 @@
 from aioradio.file_ingestion import (async_db_wrapper, async_wrapper,
                                      delete_ftp_file, establish_ftp_connection,
                                      get_current_datetime_from_timestamp,
+                                     get_efi_excel_sheet_filter,
                                      list_ftp_objects,
                                      send_emails_via_mandrill,
                                      unzip_file_get_filepaths,
@@ -227,3 +228,11 @@ async def func(**kwargs):
     print(f"Connection name: {name}\tConnection object: {conn}")
 
     await func()
+
+
+def test_get_efi_excel_sheet_filter():
+    """Test get_efi_excel_sheet_filter."""
+
+    excel_sheet_filter = get_efi_excel_sheet_filter()
+    assert '001055' in excel_sheet_filter
+    assert 'hiddensheet' in excel_sheet_filter['001055']
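
A hypothetical call site for the updated xlsx_to_tsv signature; the bucket and key names below are placeholders, not values from this repository:

    import asyncio

    from aioradio.file_ingestion import xlsx_to_tsv

    async def main():
        # fice="001055" applies that institution's sheet filter during the
        # conversion; the default fice='' leaves every sheet in place.
        error = await xlsx_to_tsv(
            s3_source_bucket="source-bucket",
            s3_source_key="incoming/roster.xlsx",
            s3_destination_bucket="dest-bucket",
            s3_destination_key="processed/roster.tsv",
            fice="001055",
        )
        # The Union[str, None] return suggests an error message on failure.
        if error is not None:
            print(f"Conversion failed: {error}")

    asyncio.run(main())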
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@
     long_description = fileobj.read()
 
 setup(name='aioradio',
-      version='0.18.2',
+      version='0.18.3',
       description='Generic asynchronous i/o python utilities for AWS services (SQS, S3, DynamoDB, Secrets Manager), Redis, MSSQL (pyodbc), JIRA and more',
       long_description=long_description,
       long_description_content_type="text/markdown",

0 comments on commit 8548d1e

Please sign in to comment.