From 21a290648276fb23646241a983315835940b1225 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Wed, 20 Mar 2024 21:45:03 +0000 Subject: [PATCH 01/15] update main --- apps/openchallenges/edam-etl/src/main.py | 103 +++++++++++++++++++++-- 1 file changed, 94 insertions(+), 9 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 191baca385..5583f232c1 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -1,16 +1,101 @@ """Extract, transform and load EDAM concepts""" +import pandas as pd +import requests from os import getenv +from typing import Optional -# Get config from the environment variables -oc_db_url = getenv("OC_DB_URL") -print(f"OC DB URL: {oc_db_url}") +"""Get config from the environment variables""" -# TODO Download EDAM concepts from GitHub or S3 bucket (CSV file) -print("[TODO] Download EDAM concepts from GitHub or S3 bucket (CSV file)") +OC_DB_URL = getenv("OC_DB_URL") +VERSION = getenv("OC_DB_VERSION") +print(f"EDAM Version: {VERSION}") +print(f"OC DB URL: {OC_DB_URL}") -# TODO Process the EDAM concepts -print("[TODO] Process the EDAM concepts") -# TODO Push the EDAM concept to the OC challenge service DB -print("[TODO] Push the EDAM concept to the OC challenge service DB") +def download_edam_csv(url: str, version: str) -> Optional[bool]: + """Download EDAM concepts from GitHub or S3 bucket (CSV file)""" + print("Downloading the EDAM concepts from GitHub (CSV file)...") + try: + response = requests.get(url) + response.raise_for_status() # Raise an exception for bad response + with open(f"EDAM_{version}.csv", "wb") as f: + f.write(response.content) + print("EDAM concepts downloaded successfully.") + except requests.RequestException as e: + print(f"Error downloading EDAM concepts: {e}") + return None + + +def transform_to_dataframe(version: str) -> pd.DataFrame: + """Transform the CSV to a DataFrame with indices starting from 1""" + print("Processing the EDAM concepts...") + try: + df = ( + pd.read_csv(f"EDAM_{version}.csv", usecols=["Class ID", "Preferred Label"]) + .rename( + columns={"Class ID": "class_id", "Preferred Label": "preferred_label"} + ) + .assign(id=lambda x: x.reset_index(drop=True).index + 1) + ) + print("EDAM concepts processed successfully.") + return df + except FileNotFoundError: +<<<<<<< HEAD + print(f"File EDAM_{VERSION}.csv not found.") + except Exception as e: + print(f"Error processing EDAM concepts: {e}") + + +def print_info_statistics(df: pd.DataFrame) -> None: + """Gather data about the EDAM ontology""" + if df is not None: + print(f"Number of Concepts Transformed: {len(df)}") + print(f"Column names: {df.columns.tolist()}") + print("Statistics:") + # Set the display options to show only 2 decimal places + pd.set_option("display.float_format", "{:.0f}".format) + print(df.describe()) + else: + print("No data available.") + + +def main() -> None: + """Main function to excute preceeding functions""" + url: str = ( + f"https://github.com/edamontology/edamontology/raw/main/releases/EDAM_{VERSION}.csv" + ) + download_edam_csv(url, VERSION) + df: pd.DataFrame = transform_to_dataframe(VERSION) + print_info_statistics(df) +======= + print(f"File EDAM_{version}.csv not found.") + except Exception as e: + print(f"Error processing EDAM concepts: {e}") + + +def print_info_statistics(df: pd.DataFrame) -> None: + """Gather data about the EDAM ontology""" + if df is not None: + print(f"Number of Concepts Transformed: {len(df)}") + print(f"Column names: {df.columns.tolist()}") + print("Statistics:") + # Set the display options to show only 2 decimal places + pd.set_option("display.float_format", "{:.0f}".format) + print(df.describe()) + else: + print("No data available.") + + + +def main() -> None: + """Main function to excute preceeding functions""" + url: str = ( + f"https://github.com/edamontology/edamontology/raw/main/releases/EDAM_{VERSION}.csv" + ) + download_edam_csv(url, VERSION) + df: pd.DataFrame = transform_to_dataframe(VERSION) + print_info_statistics(df) + +if __name__ == "__main__": + main() From eaee6522b013900a20d115ca5fa9f8feb17f1a7a Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Wed, 20 Mar 2024 21:46:44 +0000 Subject: [PATCH 02/15] update the dataframe creation --- apps/openchallenges/edam-etl/src/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 5583f232c1..f0d44e430b 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -5,7 +5,7 @@ from os import getenv from typing import Optional -"""Get config from the environment variables""" +# Get config from the environment variables OC_DB_URL = getenv("OC_DB_URL") VERSION = getenv("OC_DB_VERSION") @@ -94,7 +94,7 @@ def main() -> None: f"https://github.com/edamontology/edamontology/raw/main/releases/EDAM_{VERSION}.csv" ) download_edam_csv(url, VERSION) - df: pd.DataFrame = transform_to_dataframe(VERSION) + df = transform_to_dataframe(VERSION) print_info_statistics(df) if __name__ == "__main__": From 178651ae75e45ff6830f218ec0b426ab06ea6609 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Wed, 20 Mar 2024 21:49:56 +0000 Subject: [PATCH 03/15] update dependencies --- apps/openchallenges/edam-etl/poetry.lock | 323 +++++++++++++++++++- apps/openchallenges/edam-etl/pyproject.toml | 2 + 2 files changed, 323 insertions(+), 2 deletions(-) diff --git a/apps/openchallenges/edam-etl/poetry.lock b/apps/openchallenges/edam-etl/poetry.lock index 79a49f5137..e4c5f75868 100644 --- a/apps/openchallenges/edam-etl/poetry.lock +++ b/apps/openchallenges/edam-etl/poetry.lock @@ -1,7 +1,326 @@ # This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. -package = [] + +[[package]] +name = "certifi" +version = "2024.2.2" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, + {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.2" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, +] + +[[package]] +name = "idna" +version = "3.6" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, + {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, +] + +[[package]] +name = "numpy" +version = "1.26.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, +] + +[[package]] +name = "pandas" +version = "2.2.1" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88"}, + {file = "pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9"}, + {file = "pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7"}, + {file = "pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89"}, + {file = "pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be"}, + {file = "pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab"}, + {file = "pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572"}, +] + +[package.dependencies] +numpy = {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""} +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + +[[package]] +name = "urllib3" +version = "2.2.1" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"}, + {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "3.12.0" -content-hash = "309c47370d65aad6d832387e15ee63a4c8f13272797ab15aa7cc7b022a41e95b" +content-hash = "ceebffa66bc6efce9066f91ceffdf8f9c6976f9487993e69c76b724d2eb77d3f" diff --git a/apps/openchallenges/edam-etl/pyproject.toml b/apps/openchallenges/edam-etl/pyproject.toml index 5e3711c4db..344460f6dd 100644 --- a/apps/openchallenges/edam-etl/pyproject.toml +++ b/apps/openchallenges/edam-etl/pyproject.toml @@ -8,6 +8,8 @@ packages = [{include = "src"}] [tool.poetry.dependencies] python = "3.12.0" +pandas = "2.2.1" +requests = "2.31.0" [tool.poetry.group.dev.dependencies] From 698f373b400bc19aa39d9e5b95484430fc24bfa6 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Wed, 20 Mar 2024 21:51:13 +0000 Subject: [PATCH 04/15] add changes to main --- apps/openchallenges/edam-etl/src/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index f0d44e430b..4ab58711cd 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -41,7 +41,6 @@ def transform_to_dataframe(version: str) -> pd.DataFrame: print("EDAM concepts processed successfully.") return df except FileNotFoundError: -<<<<<<< HEAD print(f"File EDAM_{VERSION}.csv not found.") except Exception as e: print(f"Error processing EDAM concepts: {e}") @@ -68,8 +67,6 @@ def main() -> None: download_edam_csv(url, VERSION) df: pd.DataFrame = transform_to_dataframe(VERSION) print_info_statistics(df) -======= - print(f"File EDAM_{version}.csv not found.") except Exception as e: print(f"Error processing EDAM concepts: {e}") From d8e7f34782707a122a954670f3f87f077a8c8371 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Wed, 20 Mar 2024 22:00:32 +0000 Subject: [PATCH 05/15] update url --- apps/openchallenges/edam-etl/src/main.py | 49 ++++++------------------ 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 4ab58711cd..43f48a518d 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -1,5 +1,3 @@ -"""Extract, transform and load EDAM concepts""" - import pandas as pd import requests from os import getenv @@ -8,7 +6,7 @@ # Get config from the environment variables OC_DB_URL = getenv("OC_DB_URL") -VERSION = getenv("OC_DB_VERSION") +VERSION = getenv("VERSION") print(f"EDAM Version: {VERSION}") print(f"OC DB URL: {OC_DB_URL}") @@ -22,6 +20,7 @@ def download_edam_csv(url: str, version: str) -> Optional[bool]: with open(f"EDAM_{version}.csv", "wb") as f: f.write(response.content) print("EDAM concepts downloaded successfully.") + return True except requests.RequestException as e: print(f"Error downloading EDAM concepts: {e}") return None @@ -36,14 +35,15 @@ def transform_to_dataframe(version: str) -> pd.DataFrame: .rename( columns={"Class ID": "class_id", "Preferred Label": "preferred_label"} ) - .assign(id=lambda x: x.reset_index(drop=True).index + 1) + .reset_index(drop=True) + .reset_index(drop=False) + .rename(columns={"index": "id"}) ) print("EDAM concepts processed successfully.") return df - except FileNotFoundError: - print(f"File EDAM_{VERSION}.csv not found.") - except Exception as e: + except FileNotFoundError as e: print(f"Error processing EDAM concepts: {e}") + return None def print_info_statistics(df: pd.DataFrame) -> None: @@ -60,39 +60,14 @@ def print_info_statistics(df: pd.DataFrame) -> None: def main() -> None: - """Main function to excute preceeding functions""" + """Main function to execute preceding functions""" url: str = ( - f"https://github.com/edamontology/edamontology/raw/main/releases/EDAM_{VERSION}.csv" + f"https://raw.githubusercontent.com/Sage-Bionetworks/edamontology/main/releases/EDAM_{VERSION}.csv" ) - download_edam_csv(url, VERSION) - df: pd.DataFrame = transform_to_dataframe(VERSION) - print_info_statistics(df) - except Exception as e: - print(f"Error processing EDAM concepts: {e}") + if download_edam_csv(url, VERSION): + df: pd.DataFrame = transform_to_dataframe(VERSION) + print_info_statistics(df) -def print_info_statistics(df: pd.DataFrame) -> None: - """Gather data about the EDAM ontology""" - if df is not None: - print(f"Number of Concepts Transformed: {len(df)}") - print(f"Column names: {df.columns.tolist()}") - print("Statistics:") - # Set the display options to show only 2 decimal places - pd.set_option("display.float_format", "{:.0f}".format) - print(df.describe()) - else: - print("No data available.") - - - -def main() -> None: - """Main function to excute preceeding functions""" - url: str = ( - f"https://github.com/edamontology/edamontology/raw/main/releases/EDAM_{VERSION}.csv" - ) - download_edam_csv(url, VERSION) - df = transform_to_dataframe(VERSION) - print_info_statistics(df) - if __name__ == "__main__": main() From 8eb2ca54af52c334c7908459fe9205b9181a8c98 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Wed, 20 Mar 2024 22:16:14 +0000 Subject: [PATCH 06/15] update return value --- apps/openchallenges/edam-etl/src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 43f48a518d..c383b10022 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -26,7 +26,7 @@ def download_edam_csv(url: str, version: str) -> Optional[bool]: return None -def transform_to_dataframe(version: str) -> pd.DataFrame: +def transform_to_dataframe(version: str) -> pd.DataFrame | None: """Transform the CSV to a DataFrame with indices starting from 1""" print("Processing the EDAM concepts...") try: From 2cdf0aeffa1a1d9fd1bd4e1b7de6abde84f6565d Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Thu, 21 Mar 2024 16:13:36 +0000 Subject: [PATCH 07/15] update the concept counts --- apps/openchallenges/edam-etl/src/main.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index c383b10022..939cbb499f 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -51,10 +51,21 @@ def print_info_statistics(df: pd.DataFrame) -> None: if df is not None: print(f"Number of Concepts Transformed: {len(df)}") print(f"Column names: {df.columns.tolist()}") - print("Statistics:") - # Set the display options to show only 2 decimal places - pd.set_option("display.float_format", "{:.0f}".format) - print(df.describe()) + + # Count occurrences of specific concepts + concept_counts = df["preferred_label"].str.lower().value_counts() + + # Print counts of specific concepts + print("\nConcept Counts:") + for concept in ["Data", "Operation", "Format"]: + concept_count = concept_counts.get(concept.lower(), 0) + print(f"{concept}: {concept_count}") + + # Print counts of other concepts + other_count = sum(concept_counts) - sum( + concept_counts[["data", "operation", "format"]].fillna(0) + ) + print(f"Other: {other_count}") else: print("No data available.") From ddfebbcbda010d701ee5f82acee3b460e7869abc Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Thu, 21 Mar 2024 18:17:27 +0000 Subject: [PATCH 08/15] update search using regex --- apps/openchallenges/edam-etl/poetry.lock | 104 +++++++++++++++++++- apps/openchallenges/edam-etl/pyproject.toml | 1 + apps/openchallenges/edam-etl/src/main.py | 38 ++++--- 3 files changed, 131 insertions(+), 12 deletions(-) diff --git a/apps/openchallenges/edam-etl/poetry.lock b/apps/openchallenges/edam-etl/poetry.lock index e4c5f75868..07228c961f 100644 --- a/apps/openchallenges/edam-etl/poetry.lock +++ b/apps/openchallenges/edam-etl/poetry.lock @@ -260,6 +260,108 @@ files = [ {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, ] +[[package]] +name = "regex" +version = "2023.12.25" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.7" +files = [ + {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"}, + {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b014333bd0217ad3d54c143de9d4b9a3ca1c5a29a6d0d554952ea071cff0f1f8"}, + {file = "regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d865984b3f71f6d0af64d0d88f5733521698f6c16f445bb09ce746c92c97c586"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e0eabac536b4cc7f57a5f3d095bfa557860ab912f25965e08fe1545e2ed8b4c"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25a8ad70e716f96e13a637802813f65d8a6760ef48672aa3502f4c24ea8b400"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9b6d73353f777630626f403b0652055ebfe8ff142a44ec2cf18ae470395766e"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9cc99d6946d750eb75827cb53c4371b8b0fe89c733a94b1573c9dd16ea6c9e4"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88d1f7bef20c721359d8675f7d9f8e414ec5003d8f642fdfd8087777ff7f94b5"}, + {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cb3fe77aec8f1995611f966d0c656fdce398317f850d0e6e7aebdfe61f40e1cd"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7aa47c2e9ea33a4a2a05f40fcd3ea36d73853a2aae7b4feab6fc85f8bf2c9704"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:df26481f0c7a3f8739fecb3e81bc9da3fcfae34d6c094563b9d4670b047312e1"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c40281f7d70baf6e0db0c2f7472b31609f5bc2748fe7275ea65a0b4601d9b392"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d94a1db462d5690ebf6ae86d11c5e420042b9898af5dcf278bd97d6bda065423"}, + {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba1b30765a55acf15dce3f364e4928b80858fa8f979ad41f862358939bdd1f2f"}, + {file = "regex-2023.12.25-cp310-cp310-win32.whl", hash = "sha256:150c39f5b964e4d7dba46a7962a088fbc91f06e606f023ce57bb347a3b2d4630"}, + {file = "regex-2023.12.25-cp310-cp310-win_amd64.whl", hash = "sha256:09da66917262d9481c719599116c7dc0c321ffcec4b1f510c4f8a066f8768105"}, + {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1b9d811f72210fa9306aeb88385b8f8bcef0dfbf3873410413c00aa94c56c2b6"}, + {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d902a43085a308cef32c0d3aea962524b725403fd9373dea18110904003bac97"}, + {file = "regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d166eafc19f4718df38887b2bbe1467a4f74a9830e8605089ea7a30dd4da8887"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7ad32824b7f02bb3c9f80306d405a1d9b7bb89362d68b3c5a9be53836caebdb"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:636ba0a77de609d6510235b7f0e77ec494d2657108f777e8765efc060094c98c"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fda75704357805eb953a3ee15a2b240694a9a514548cd49b3c5124b4e2ad01b"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f72cbae7f6b01591f90814250e636065850c5926751af02bb48da94dfced7baa"}, + {file = "regex-2023.12.25-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2a0b1857f18b11e3b0e54ddfefc96af46b0896fb678c85f63fb8c37518b3e7"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7502534e55c7c36c0978c91ba6f61703faf7ce733715ca48f499d3dbbd7657e0"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e8c7e08bb566de4faaf11984af13f6bcf6a08f327b13631d41d62592681d24fe"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:283fc8eed679758de38fe493b7d7d84a198b558942b03f017b1f94dda8efae80"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f44dd4d68697559d007462b0a3a1d9acd61d97072b71f6d1968daef26bc744bd"}, + {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:67d3ccfc590e5e7197750fcb3a2915b416a53e2de847a728cfa60141054123d4"}, + {file = "regex-2023.12.25-cp311-cp311-win32.whl", hash = "sha256:68191f80a9bad283432385961d9efe09d783bcd36ed35a60fb1ff3f1ec2efe87"}, + {file = "regex-2023.12.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d2af3f6b8419661a0c421584cfe8aaec1c0e435ce7e47ee2a97e344b98f794f"}, + {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8a0ccf52bb37d1a700375a6b395bff5dd15c50acb745f7db30415bae3c2b0715"}, + {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c3c4a78615b7762740531c27cf46e2f388d8d727d0c0c739e72048beb26c8a9d"}, + {file = "regex-2023.12.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad83e7545b4ab69216cef4cc47e344d19622e28aabec61574b20257c65466d6a"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7a635871143661feccce3979e1727c4e094f2bdfd3ec4b90dfd4f16f571a87a"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d498eea3f581fbe1b34b59c697512a8baef88212f92e4c7830fcc1499f5b45a5"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43f7cd5754d02a56ae4ebb91b33461dc67be8e3e0153f593c509e21d219c5060"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51f4b32f793812714fd5307222a7f77e739b9bc566dc94a18126aba3b92b98a3"}, + {file = "regex-2023.12.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba99d8077424501b9616b43a2d208095746fb1284fc5ba490139651f971d39d9"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4bfc2b16e3ba8850e0e262467275dd4d62f0d045e0e9eda2bc65078c0110a11f"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8c2c19dae8a3eb0ea45a8448356ed561be843b13cbc34b840922ddf565498c1c"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:60080bb3d8617d96f0fb7e19796384cc2467447ef1c491694850ebd3670bc457"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b77e27b79448e34c2c51c09836033056a0547aa360c45eeeb67803da7b0eedaf"}, + {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:518440c991f514331f4850a63560321f833979d145d7d81186dbe2f19e27ae3d"}, + {file = "regex-2023.12.25-cp312-cp312-win32.whl", hash = "sha256:e2610e9406d3b0073636a3a2e80db05a02f0c3169b5632022b4e81c0364bcda5"}, + {file = "regex-2023.12.25-cp312-cp312-win_amd64.whl", hash = "sha256:cc37b9aeebab425f11f27e5e9e6cf580be7206c6582a64467a14dda211abc232"}, + {file = "regex-2023.12.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:da695d75ac97cb1cd725adac136d25ca687da4536154cdc2815f576e4da11c69"}, + {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d126361607b33c4eb7b36debc173bf25d7805847346dd4d99b5499e1fef52bc7"}, + {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4719bb05094d7d8563a450cf8738d2e1061420f79cfcc1fa7f0a44744c4d8f73"}, + {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dd58946bce44b53b06d94aa95560d0b243eb2fe64227cba50017a8d8b3cd3e2"}, + {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22a86d9fff2009302c440b9d799ef2fe322416d2d58fc124b926aa89365ec482"}, + {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aae8101919e8aa05ecfe6322b278f41ce2994c4a430303c4cd163fef746e04f"}, + {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e692296c4cc2873967771345a876bcfc1c547e8dd695c6b89342488b0ea55cd8"}, + {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:263ef5cc10979837f243950637fffb06e8daed7f1ac1e39d5910fd29929e489a"}, + {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d6f7e255e5fa94642a0724e35406e6cb7001c09d476ab5fce002f652b36d0c39"}, + {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:88ad44e220e22b63b0f8f81f007e8abbb92874d8ced66f32571ef8beb0643b2b"}, + {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:3a17d3ede18f9cedcbe23d2daa8a2cd6f59fe2bf082c567e43083bba3fb00347"}, + {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d15b274f9e15b1a0b7a45d2ac86d1f634d983ca40d6b886721626c47a400bf39"}, + {file = "regex-2023.12.25-cp37-cp37m-win32.whl", hash = "sha256:ed19b3a05ae0c97dd8f75a5d8f21f7723a8c33bbc555da6bbe1f96c470139d3c"}, + {file = "regex-2023.12.25-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d1047952c0b8104a1d371f88f4ab62e6275567d4458c1e26e9627ad489b445"}, + {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b43523d7bc2abd757119dbfb38af91b5735eea45537ec6ec3a5ec3f9562a1c53"}, + {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:efb2d82f33b2212898f1659fb1c2e9ac30493ac41e4d53123da374c3b5541e64"}, + {file = "regex-2023.12.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7fca9205b59c1a3d5031f7e64ed627a1074730a51c2a80e97653e3e9fa0d415"}, + {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086dd15e9435b393ae06f96ab69ab2d333f5d65cbe65ca5a3ef0ec9564dfe770"}, + {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e81469f7d01efed9b53740aedd26085f20d49da65f9c1f41e822a33992cb1590"}, + {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:34e4af5b27232f68042aa40a91c3b9bb4da0eeb31b7632e0091afc4310afe6cb"}, + {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9852b76ab558e45b20bf1893b59af64a28bd3820b0c2efc80e0a70a4a3ea51c1"}, + {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff100b203092af77d1a5a7abe085b3506b7eaaf9abf65b73b7d6905b6cb76988"}, + {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cc038b2d8b1470364b1888a98fd22d616fba2b6309c5b5f181ad4483e0017861"}, + {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:094ba386bb5c01e54e14434d4caabf6583334090865b23ef58e0424a6286d3dc"}, + {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5cd05d0f57846d8ba4b71d9c00f6f37d6b97d5e5ef8b3c3840426a475c8f70f4"}, + {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:9aa1a67bbf0f957bbe096375887b2505f5d8ae16bf04488e8b0f334c36e31360"}, + {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:98a2636994f943b871786c9e82bfe7883ecdaba2ef5df54e1450fa9869d1f756"}, + {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37f8e93a81fc5e5bd8db7e10e62dc64261bcd88f8d7e6640aaebe9bc180d9ce2"}, + {file = "regex-2023.12.25-cp38-cp38-win32.whl", hash = "sha256:d78bd484930c1da2b9679290a41cdb25cc127d783768a0369d6b449e72f88beb"}, + {file = "regex-2023.12.25-cp38-cp38-win_amd64.whl", hash = "sha256:b521dcecebc5b978b447f0f69b5b7f3840eac454862270406a39837ffae4e697"}, + {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f7bc09bc9c29ebead055bcba136a67378f03d66bf359e87d0f7c759d6d4ffa31"}, + {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e14b73607d6231f3cc4622809c196b540a6a44e903bcfad940779c80dffa7be7"}, + {file = "regex-2023.12.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9eda5f7a50141291beda3edd00abc2d4a5b16c29c92daf8d5bd76934150f3edc"}, + {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc6bb9aa69aacf0f6032c307da718f61a40cf970849e471254e0e91c56ffca95"}, + {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298dc6354d414bc921581be85695d18912bea163a8b23cac9a2562bbcd5088b1"}, + {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f4e475a80ecbd15896a976aa0b386c5525d0ed34d5c600b6d3ebac0a67c7ddf"}, + {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:531ac6cf22b53e0696f8e1d56ce2396311254eb806111ddd3922c9d937151dae"}, + {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22f3470f7524b6da61e2020672df2f3063676aff444db1daa283c2ea4ed259d6"}, + {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:89723d2112697feaa320c9d351e5f5e7b841e83f8b143dba8e2d2b5f04e10923"}, + {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ecf44ddf9171cd7566ef1768047f6e66975788258b1c6c6ca78098b95cf9a3d"}, + {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:905466ad1702ed4acfd67a902af50b8db1feeb9781436372261808df7a2a7bca"}, + {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4558410b7a5607a645e9804a3e9dd509af12fb72b9825b13791a37cd417d73a5"}, + {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7e316026cc1095f2a3e8cc012822c99f413b702eaa2ca5408a513609488cb62f"}, + {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3b1de218d5375cd6ac4b5493e0b9f3df2be331e86520f23382f216c137913d20"}, + {file = "regex-2023.12.25-cp39-cp39-win32.whl", hash = "sha256:11a963f8e25ab5c61348d090bf1b07f1953929c13bd2309a0662e9ff680763c9"}, + {file = "regex-2023.12.25-cp39-cp39-win_amd64.whl", hash = "sha256:e693e233ac92ba83a87024e1d32b5f9ab15ca55ddd916d878146f4e3406b5c91"}, + {file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -323,4 +425,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "3.12.0" -content-hash = "ceebffa66bc6efce9066f91ceffdf8f9c6976f9487993e69c76b724d2eb77d3f" +content-hash = "60cfeada182d8e408f67c4cf70ff4d26304af764a67ff223a526f177763c0679" diff --git a/apps/openchallenges/edam-etl/pyproject.toml b/apps/openchallenges/edam-etl/pyproject.toml index 344460f6dd..7cc52ed8a0 100644 --- a/apps/openchallenges/edam-etl/pyproject.toml +++ b/apps/openchallenges/edam-etl/pyproject.toml @@ -10,6 +10,7 @@ packages = [{include = "src"}] python = "3.12.0" pandas = "2.2.1" requests = "2.31.0" +regex = "2023.12.25" [tool.poetry.group.dev.dependencies] diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 939cbb499f..bd5cb52158 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -2,6 +2,7 @@ import requests from os import getenv from typing import Optional +import regex as re # Get config from the environment variables @@ -52,20 +53,35 @@ def print_info_statistics(df: pd.DataFrame) -> None: print(f"Number of Concepts Transformed: {len(df)}") print(f"Column names: {df.columns.tolist()}") - # Count occurrences of specific concepts - concept_counts = df["preferred_label"].str.lower().value_counts() + # Define regex patterns for each concept + data_pattern = re.compile(r"data_", re.IGNORECASE) + operation_pattern = re.compile(r"operation_", re.IGNORECASE) + format_pattern = re.compile(r"format_", re.IGNORECASE) - # Print counts of specific concepts - print("\nConcept Counts:") - for concept in ["Data", "Operation", "Format"]: - concept_count = concept_counts.get(concept.lower(), 0) - print(f"{concept}: {concept_count}") + # Initialize counts + data_count = 0 + operation_count = 0 + format_count = 0 + other_count = 0 - # Print counts of other concepts - other_count = sum(concept_counts) - sum( - concept_counts[["data", "operation", "format"]].fillna(0) - ) + # Loop through the class_id column and count occurrences + for value in df["class_id"]: + if data_pattern.search(value): + data_count += 1 + elif operation_pattern.search(value): + operation_count += 1 + elif format_pattern.search(value): + format_count += 1 + else: + other_count += 1 + + # Print counts of each concept + print("Concept Counts:") + print(f"Data: {data_count}") + print(f"Operation: {operation_count}") + print(f"Format: {format_count}") print(f"Other: {other_count}") + else: print("No data available.") From 2380f116d04a76329520a8d4e8dbcd12c9f46fb2 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Thu, 21 Mar 2024 23:04:06 +0000 Subject: [PATCH 09/15] apply requested changes --- apps/openchallenges/edam-etl/src/main.py | 51 ++++++++++++++---------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index bd5cb52158..85e54c8c76 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -53,33 +53,42 @@ def print_info_statistics(df: pd.DataFrame) -> None: print(f"Number of Concepts Transformed: {len(df)}") print(f"Column names: {df.columns.tolist()}") - # Define regex patterns for each concept - data_pattern = re.compile(r"data_", re.IGNORECASE) - operation_pattern = re.compile(r"operation_", re.IGNORECASE) - format_pattern = re.compile(r"format_", re.IGNORECASE) - - # Initialize counts - data_count = 0 - operation_count = 0 - format_count = 0 - other_count = 0 - - # Loop through the class_id column and count occurrences - for value in df["class_id"]: - if data_pattern.search(value): - data_count += 1 - elif operation_pattern.search(value): - operation_count += 1 - elif format_pattern.search(value): - format_count += 1 - else: - other_count += 1 + # Create regex patterns for each concept + data_pattern = r"data_" + operation_pattern = r"operation_" + format_pattern = r"format_" + topic_pattern = r"topic_" + identifier_pattern = r"identifier_" + + # Use pandas' vectorized string operations to count occurrences + data_count = ( + df["class_id"].str.contains(data_pattern, case=False, na=False).sum() + ) + operation_count = ( + df["class_id"].str.contains(operation_pattern, case=False, na=False).sum() + ) + format_count = ( + df["class_id"].str.contains(format_pattern, case=False, na=False).sum() + ) + topic_count = ( + df["class_id"].str.contains(topic_pattern, case=False, na=False).sum() + ) + identifier_count = ( + df["class_id"].str.contains(identifier_pattern, case=False, na=False).sum() + ) + + # Calculate 'other' count by subtracting the specific counts from the total + other_count = len(df) - ( + data_count + operation_count + format_count + identifier_count + topic_count + ) # Print counts of each concept print("Concept Counts:") print(f"Data: {data_count}") print(f"Operation: {operation_count}") print(f"Format: {format_count}") + print(f"Topic: {format_count}") + print(f"Identifier: {format_count}") print(f"Other: {other_count}") else: From 0be846eecf95e739f41a7ac17686098d16f39caa Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Fri, 22 Mar 2024 15:56:03 +0000 Subject: [PATCH 10/15] add regex true --- apps/openchallenges/edam-etl/src/main.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 85e54c8c76..7c258bea60 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -62,19 +62,27 @@ def print_info_statistics(df: pd.DataFrame) -> None: # Use pandas' vectorized string operations to count occurrences data_count = ( - df["class_id"].str.contains(data_pattern, case=False, na=False).sum() + df["class_id"] + .str.contains(data_pattern, case=False, na=False, regex=True) + .sum() ) operation_count = ( df["class_id"].str.contains(operation_pattern, case=False, na=False).sum() ) format_count = ( - df["class_id"].str.contains(format_pattern, case=False, na=False).sum() + df["class_id"] + .str.contains(format_pattern, case=False, na=False, regex=True) + .sum() ) topic_count = ( - df["class_id"].str.contains(topic_pattern, case=False, na=False).sum() + df["class_id"] + .str.contains(topic_pattern, case=False, na=False, regex=True) + .sum() ) identifier_count = ( - df["class_id"].str.contains(identifier_pattern, case=False, na=False).sum() + df["class_id"] + .str.contains(identifier_pattern, case=False, na=False, regex=True) + .sum() ) # Calculate 'other' count by subtracting the specific counts from the total From 935bc5ed4cce74ac4f84865d040b7cd2fb7ecc36 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Fri, 22 Mar 2024 16:00:04 +0000 Subject: [PATCH 11/15] add / --- apps/openchallenges/edam-etl/src/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 7c258bea60..3e71a3c4b7 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -54,11 +54,11 @@ def print_info_statistics(df: pd.DataFrame) -> None: print(f"Column names: {df.columns.tolist()}") # Create regex patterns for each concept - data_pattern = r"data_" - operation_pattern = r"operation_" - format_pattern = r"format_" - topic_pattern = r"topic_" - identifier_pattern = r"identifier_" + data_pattern = r"/data_" + operation_pattern = r"/operation_" + format_pattern = r"/format_" + topic_pattern = r"/topic_" + identifier_pattern = r"/identifier_" # Use pandas' vectorized string operations to count occurrences data_count = ( From bc5dc825b60656460d8780003834d76e2ac92563 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Fri, 22 Mar 2024 18:45:16 +0000 Subject: [PATCH 12/15] update requested changes --- apps/openchallenges/edam-etl/src/main.py | 47 ++++++++++-------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 3e71a3c4b7..d9a183544c 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -47,6 +47,15 @@ def transform_to_dataframe(version: str) -> pd.DataFrame | None: return None +def count_occurrences(identifier_pattern: str, df) -> int: + """Count the number of pattern occurrences""" + return ( + df["class_id"] + .str.contains(identifier_pattern, case=False, na=False, regex=True) + .sum() + ) + + def print_info_statistics(df: pd.DataFrame) -> None: """Gather data about the EDAM ontology""" if df is not None: @@ -54,36 +63,18 @@ def print_info_statistics(df: pd.DataFrame) -> None: print(f"Column names: {df.columns.tolist()}") # Create regex patterns for each concept - data_pattern = r"/data_" - operation_pattern = r"/operation_" - format_pattern = r"/format_" - topic_pattern = r"/topic_" - identifier_pattern = r"/identifier_" + data_pattern = r"/data_\d+$" + operation_pattern = r"/operation_\d+$" + format_pattern = r"/format_\d+$" + topic_pattern = r"/topic_\d+$" + identifier_pattern = r"/identifier_\d+$" # Use pandas' vectorized string operations to count occurrences - data_count = ( - df["class_id"] - .str.contains(data_pattern, case=False, na=False, regex=True) - .sum() - ) - operation_count = ( - df["class_id"].str.contains(operation_pattern, case=False, na=False).sum() - ) - format_count = ( - df["class_id"] - .str.contains(format_pattern, case=False, na=False, regex=True) - .sum() - ) - topic_count = ( - df["class_id"] - .str.contains(topic_pattern, case=False, na=False, regex=True) - .sum() - ) - identifier_count = ( - df["class_id"] - .str.contains(identifier_pattern, case=False, na=False, regex=True) - .sum() - ) + data_count = count_occurrences(data_pattern, df) + operation_count = count_occurrences(operation_pattern, df) + format_count = count_occurrences(format_pattern, df) + topic_count = count_occurrences(topic_pattern, df) + identifier_count = count_occurrences(identifier_pattern, df) # Calculate 'other' count by subtracting the specific counts from the total other_count = len(df) - ( From 1de7f283ccfaa39cb2153437a2b61c5f6e5a5c5d Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Fri, 22 Mar 2024 18:45:48 +0000 Subject: [PATCH 13/15] remove unused module --- apps/openchallenges/edam-etl/src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index d9a183544c..80182931fb 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -2,7 +2,6 @@ import requests from os import getenv from typing import Optional -import regex as re # Get config from the environment variables From 76e837f2755cd2d52cb4851eca66e1d5b67f97c7 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Fri, 22 Mar 2024 19:23:55 +0000 Subject: [PATCH 14/15] update changes --- apps/openchallenges/edam-etl/src/main.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 80182931fb..4deb17241c 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -46,13 +46,11 @@ def transform_to_dataframe(version: str) -> pd.DataFrame | None: return None -def count_occurrences(identifier_pattern: str, df) -> int: +def count_occurrences(identifier_pattern: str, column: pd.Series) -> np.int64: """Count the number of pattern occurrences""" - return ( - df["class_id"] - .str.contains(identifier_pattern, case=False, na=False, regex=True) - .sum() - ) + return column.str.contains( + identifier_pattern, case=False, na=False, regex=True + ).sum() def print_info_statistics(df: pd.DataFrame) -> None: @@ -67,13 +65,14 @@ def print_info_statistics(df: pd.DataFrame) -> None: format_pattern = r"/format_\d+$" topic_pattern = r"/topic_\d+$" identifier_pattern = r"/identifier_\d+$" + concept_column = df["class_id"] # Use pandas' vectorized string operations to count occurrences - data_count = count_occurrences(data_pattern, df) - operation_count = count_occurrences(operation_pattern, df) - format_count = count_occurrences(format_pattern, df) - topic_count = count_occurrences(topic_pattern, df) - identifier_count = count_occurrences(identifier_pattern, df) + data_count = count_occurrences(data_pattern, concept_column) + operation_count = count_occurrences(operation_pattern, concept_column) + format_count = count_occurrences(format_pattern, concept_column) + topic_count = count_occurrences(topic_pattern, concept_column) + identifier_count = count_occurrences(identifier_pattern, concept_column) # Calculate 'other' count by subtracting the specific counts from the total other_count = len(df) - ( @@ -88,6 +87,7 @@ def print_info_statistics(df: pd.DataFrame) -> None: print(f"Topic: {format_count}") print(f"Identifier: {format_count}") print(f"Other: {other_count}") + print(type(data_count)) else: print("No data available.") From a0cd52e439c1d167624fdd7a3642139b15974639 Mon Sep 17 00:00:00 2001 From: mdsage1 Date: Fri, 22 Mar 2024 19:24:19 +0000 Subject: [PATCH 15/15] remove type check --- apps/openchallenges/edam-etl/src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/openchallenges/edam-etl/src/main.py b/apps/openchallenges/edam-etl/src/main.py index 4deb17241c..67bb6ddb8c 100755 --- a/apps/openchallenges/edam-etl/src/main.py +++ b/apps/openchallenges/edam-etl/src/main.py @@ -87,7 +87,6 @@ def print_info_statistics(df: pd.DataFrame) -> None: print(f"Topic: {format_count}") print(f"Identifier: {format_count}") print(f"Other: {other_count}") - print(type(data_count)) else: print("No data available.")