Add excel sheet filter to file ingestion efi functions
tim.reichard committed Jun 7, 2023
1 parent b259ad8 commit 8548d1e
Showing 5 changed files with 52 additions and 6 deletions.
5 changes: 5 additions & 0 deletions HISTORY.rst
@@ -3,6 +3,11 @@ History
 =======
 
 
+v0.18.3 (2023-06-07)
+
+* Add excel sheet filter to file ingestion efi functions.
+
+
 v0.18.2 (2023-04-18)
 
 * Add psycopg2 imports inside functions and not at the top of the file.
2 changes: 1 addition & 1 deletion Makefile
@@ -6,7 +6,7 @@ clean:
 
 install:
 	. env/bin/activate; \
-	pip install cython==0.29.33; \
+	pip install cython==0.29.35; \
 	pip install -r aioradio/requirements.txt
 
 setup:
40 changes: 36 additions & 4 deletions aioradio/file_ingestion.py
@@ -31,6 +31,7 @@
 from typing import Any, Dict, List, Union
 
 import cchardet as chardet
+import httpx
 import mandrill
 from openpyxl import load_workbook
 from smb.base import SharedFile
@@ -442,6 +443,7 @@ async def xlsx_to_tsv(
     s3_source_key: str,
     s3_destination_bucket: str,
     s3_destination_key: str,
+    fice: str='',
     delimiter: str='\t'
 ) -> Union[str, None]:
     """Convert xlsx file to csv/tsv file.
@@ -451,6 +453,7 @@ async def xlsx_to_tsv(
         s3_source_key (str): source xlsx file s3 key
         s3_destination_bucket (str): destination xlsx file s3 bucket
         s3_destination_key (str): destination xlsx file s3 key
+        fice (str): Institution unique identifier
         delimiter (str, optional): Delimiter. Defaults to '\t'.
 
     Returns:
@@ -460,7 +463,7 @@ async def xlsx_to_tsv(
     try:
         with NamedTemporaryFile(suffix='.xlsx') as tmp:
             await download_file(bucket=s3_source_bucket, filepath=tmp.name, s3_key=s3_source_key)
-            records, _ = xlsx_to_records(tmp)
+            records, _ = xlsx_to_records(fice, tmp)
 
             await tsv_to_s3(records, delimiter, s3_destination_bucket, s3_destination_key)
     except Exception as err:
@@ -474,6 +477,7 @@ async def zipfile_to_tsv(
     s3_source_key: str,
     s3_destination_bucket: str,
     s3_destination_key: str,
+    fice: str='',
     delimiter: str='\t'
 ) -> Union[str, None]:
     """Convert zipfile to csv/tsv file.
@@ -483,6 +487,7 @@ async def zipfile_to_tsv(
         s3_source_key (str): source zipfile s3 key
         s3_destination_bucket (str): destination zipfile s3 bucket
         s3_destination_key (str): destination zipfile s3 key
+        fice (str): Institution unique identifier
         delimiter (str, optional): Delimiter. Defaults to '\t'.
 
     Returns:
@@ -500,7 +505,7 @@ async def zipfile_to_tsv(
             for path in await unzip_file_get_filepaths(tmp.name, tmp_directory, include_extensions=extensions):
                 ext = os.path.splitext(path)[1].lower()
                 if ext == '.xlsx':
-                    records_from_path, header = xlsx_to_records(path, header)
+                    records_from_path, header = xlsx_to_records(fice, path, header)
                     records.extend(records_from_path)
                 else:
                     encoding = detect_encoding(path)
@@ -568,10 +573,11 @@ def tsv_to_records(path: str, encoding: str, delimiter: str, header: str) -> tuple:
     return records, header
 
 
-def xlsx_to_records(filepath: str, header: Union[str, None]=None) -> tuple:
+def xlsx_to_records(fice: str, filepath: str, header: Union[str, None]=None) -> tuple:
     """Load excel file to records object as list of lists.
 
     Args:
+        fice (str): Institution unique identifier
         filepath (str): Temporary Filepath
         header (Union[str, None], optional): Header. Defaults to None.
@@ -582,10 +588,14 @@ def xlsx_to_records(fice: str, filepath: str, header: Union[str, None]=None) -> tuple:
         tuple: Records as list of lists, header
     """
 
+    excel_sheet_filter = get_efi_excel_sheet_filter()
+
     records = []
     workbook = load_workbook(filepath, read_only=True)
     for sheet in workbook:
-        if sheet.title != 'hiddenSheet':
+        # Make sure excel sheet hasn't been marked to skip for particular fice
+        if fice not in excel_sheet_filter or sheet.title not in excel_sheet_filter[fice]:
+
             sheet.calculate_dimension(force=True)
 
             for idx, row in enumerate(sheet.values):
@@ -663,3 +673,25 @@ def detect_delimiter(path: str, encoding: str) -> str:
             count = char_count
 
     return delimiter
+
+
+def get_efi_excel_sheet_filter() -> dict[str, set]:
+    """Get the Excel sheet filter from the EFI api.
+
+    Returns:
+        dict[str, set]: Excel sheet filter with fice as key and sheet names as value
+    """
+
+    excel_sheet_filter = {}
+    try:
+        with httpx.Client() as client:
+            resp = client.get(url="http://efi.nrccua-app.org/filter/excel-sheet", timeout=30.0)
+            for fice, item in resp.json()["excel_sheet_filter"].items():
+                excel_sheet_filter[fice] = set()
+                for name, value in item.items():
+                    if value:
+                        excel_sheet_filter[fice].add(name)
+    except Exception:
+        pass
+
+    return excel_sheet_filter
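
For reviewers, a minimal sketch of the data flow this file's changes introduce. The payload shape is an assumption inferred from the parsing loop above; the fice '001055' and sheet name 'hiddensheet' come from the new test, while 'Sheet1' is made up for illustration:

    # Assumed response shape from the EFI excel-sheet endpoint; truthy values
    # mark sheets that ingestion should skip for that fice.
    sample_payload = {
        "excel_sheet_filter": {
            "001055": {"hiddensheet": True, "Sheet1": False}
        }
    }

    excel_sheet_filter = {}
    for fice, item in sample_payload["excel_sheet_filter"].items():
        # Keep only sheet names flagged truthy, mirroring the function's loop.
        excel_sheet_filter[fice] = {name for name, value in item.items() if value}

    # xlsx_to_records then skips any sheet whose title is in the fice's set.
    assert "hiddensheet" in excel_sheet_filter["001055"]
    assert "Sheet1" not in excel_sheet_filter["001055"]

Note the design choice: because get_efi_excel_sheet_filter swallows exceptions and returns an empty dict, a fice absent from the filter (or an unreachable EFI service) means no sheets are skipped, which preserves the old behavior for callers that pass the default fice=''.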
9 changes: 9 additions & 0 deletions aioradio/tests/file_ingestion_test.py
@@ -14,6 +14,7 @@
 from aioradio.file_ingestion import (async_db_wrapper, async_wrapper,
                                      delete_ftp_file, establish_ftp_connection,
                                      get_current_datetime_from_timestamp,
+                                     get_efi_excel_sheet_filter,
                                      list_ftp_objects,
                                      send_emails_via_mandrill,
                                      unzip_file_get_filepaths,
@@ -227,3 +228,11 @@ async def func(**kwargs):
     print(f"Connection name: {name}\tConnection object: {conn}")
 
     await func()
+
+
+def test_get_efi_excel_sheet_filter():
+    """Test get_efi_excel_sheet_filter."""
+
+    excel_sheet_filter = get_efi_excel_sheet_filter()
+    assert '001055' in excel_sheet_filter
+    assert 'hiddensheet' in excel_sheet_filter['001055']
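
A hypothetical call site for the updated xlsx_to_tsv signature; the bucket and key names below are placeholders, not values from this repository:

    import asyncio

    from aioradio.file_ingestion import xlsx_to_tsv

    async def main():
        # fice="001055" applies that institution's sheet filter during the
        # conversion; the default fice='' leaves every sheet in place.
        error = await xlsx_to_tsv(
            s3_source_bucket="source-bucket",
            s3_source_key="incoming/roster.xlsx",
            s3_destination_bucket="dest-bucket",
            s3_destination_key="processed/roster.tsv",
            fice="001055",
        )
        # The Union[str, None] return suggests an error message on failure.
        if error is not None:
            print(f"Conversion failed: {error}")

    asyncio.run(main())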
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@
     long_description = fileobj.read()
 
 setup(name='aioradio',
-      version='0.18.2',
+      version='0.18.3',
       description='Generic asynchronous i/o python utilities for AWS services (SQS, S3, DynamoDB, Secrets Manager), Redis, MSSQL (pyodbc), JIRA and more',
       long_description=long_description,
       long_description_content_type="text/markdown",

0 comments on commit 8548d1e

Please sign in to comment.