Skip to content

Commit

Permalink
Merge pull request #84 from nrccua/DS-309-update-xlsx-to-tsv-func-to-…
Browse files Browse the repository at this point in the history
…use-s3

Update xlsx_to_tsv function to use s3
  • Loading branch information
nrccua-timr authored Oct 28, 2022
2 parents a59a225 + a2d7265 commit 7e5f0ba
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 24 deletions.
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ History
=======


v0.17.19 (2022-10-28)

* Update xlsx_to_tsv function to use s3 instead of local directories for origin/destination files.


v0.17.18 (2022-10-21)

* Add a function in file_ingestion to convert an xlsx file to csv/tsv.
Expand Down
60 changes: 37 additions & 23 deletions aioradio/file_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from dataclasses import field as dc_field
from datetime import datetime, timedelta, timezone, tzinfo
from pathlib import Path
from tempfile import NamedTemporaryFile
from types import coroutine
from typing import Any, Dict, List

Expand All @@ -38,6 +39,7 @@
from smb.smb_structs import OperationFailure
from smb.SMBConnection import SMBConnection

from aioradio.aws.s3 import download_file, upload_file
from aioradio.aws.secrets import get_secret
from aioradio.psycopg2 import establish_psycopg2_connection

Expand Down Expand Up @@ -1620,12 +1622,20 @@ async def get_ftp_file_attributes(conn: SMBConnection, service_name: str, ftp_pa
return conn.getAttributes(service_name=service_name, path=ftp_path)


def xlsx_to_tsv(source: str, destination: str, delimiter: str='\t') -> str | None:
async def xlsx_to_tsv(
s3_source_bucket: str,
s3_source_key: str,
s3_destination_bucket: str,
s3_destination_key: str,
delimiter: str='\t'
) -> str | None:
"""Convert an xlsx file to a csv/tsv file.
Args:
source (str): XLSX filepath to convert
destination (str): Destination CSV/TSV filepath
s3_source_bucket (str): source xlsx file s3 bucket
s3_source_key (str): source xlsx file s3 key
s3_destination_bucket (str): destination csv/tsv file s3 bucket
s3_destination_key (str): destination csv/tsv file s3 key
delimiter (str, optional): Delimiter. Defaults to '\t'.
Returns:
Expand All @@ -1635,27 +1645,31 @@ def xlsx_to_tsv(source: str, destination: str, delimiter: str='\t') -> str | Non
try:
records = []
header = None
workbook = load_workbook(source, read_only=True)
for sheet in workbook:
sheet.calculate_dimension(force=True)

for idx, row in enumerate(sheet.values):
items = [str(value) if value is not None else "" for value in row]

if idx == 0:
if header is None:
header = items
elif header != items:
raise ValueError("Excel sheets must contain the exact same header")
else:
continue

records.append(items)
workbook.close()

with open(destination, 'w', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, delimiter=delimiter)

with NamedTemporaryFile(suffix='.xlsx') as tmp:
await download_file(bucket=s3_source_bucket, filepath=tmp.name, s3_key=s3_source_key)
workbook = load_workbook(tmp.name, read_only=True)
for sheet in workbook:
sheet.calculate_dimension(force=True)

for idx, row in enumerate(sheet.values):
items = [str(value) if value is not None else "" for value in row]

if idx == 0:
if header is None:
header = items
elif header != items:
raise ValueError("Excel sheets must contain the exact same header")
else:
continue

records.append(items)
workbook.close()

with NamedTemporaryFile(mode='w') as tmp:
writer = csv.writer(tmp, delimiter=delimiter)
writer.writerows(records)
await upload_file(bucket=s3_destination_bucket, filepath=tmp.name, s3_key=s3_destination_key)

except Exception as err:
print(err)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
long_description = fileobj.read()

setup(name='aioradio',
version='0.17.18',
version='0.17.19',
description='Generic asynchronous i/o python utilities for AWS services (SQS, S3, DynamoDB, Secrets Manager), Redis, MSSQL (pyodbc), JIRA and more',
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down

0 comments on commit 7e5f0ba

Please sign in to comment.