From 9369b56c446adbdcd06527cae6f5d6d96e484a6c Mon Sep 17 00:00:00 2001 From: Ezepheros Date: Thu, 13 Jun 2024 09:52:55 +0800 Subject: [PATCH 1/9] add dir --- src/data/download.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/data/download.py diff --git a/src/data/download.py b/src/data/download.py new file mode 100644 index 0000000..e69de29 From b36a7fdfc41c185ffc9454ef6447a3e45e25faa3 Mon Sep 17 00:00:00 2001 From: Ezepheros Date: Thu, 13 Jun 2024 13:09:33 +0800 Subject: [PATCH 2/9] add download files --- .gitignore | 5 + .../download.py => beatylm2/data/__init__.py | 0 beatylm2/data/download.py | 96 +++++++++++++++++++ beatylm2/utils/__init__.py | 0 beatylm2/utils/data_utils.py | 24 +++++ requirements.txt | 3 + setup.py | 13 +++ 7 files changed, 141 insertions(+) rename src/data/download.py => beatylm2/data/__init__.py (100%) create mode 100644 beatylm2/data/download.py create mode 100644 beatylm2/utils/__init__.py create mode 100644 beatylm2/utils/data_utils.py create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 82f9275..8e6c66e 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,8 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ + +# additional + +# data +*.jpg \ No newline at end of file diff --git a/src/data/download.py b/beatylm2/data/__init__.py similarity index 100% rename from src/data/download.py rename to beatylm2/data/__init__.py diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py new file mode 100644 index 0000000..ecb0548 --- /dev/null +++ b/beatylm2/data/download.py @@ -0,0 +1,96 @@ +from beatylm2.utils.data_utils import get_image_list +from pathlib import Path +import requests +from typing import List, Union +import os +import time +import random +from tqdm import tqdm + +''' Definition of get_image_list from bbl-ml-demo/beatylm2/utils/data_utils.py: +from pygbif import occurrences as occ + +def get_image_list(**kwargs): + # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ + + # default search arguments + search_args = { + "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13", + "mediaType": "StillImage", + "limit": 1000, + } + + # override default search arguments with user-provided arguments + search_args.update(kwargs) + + # search for occurrences + query = occ.search(**search_args) + + # extract the list of image files + list_of_files = [] + for record in query["results"]: + list_of_files.append(record["catalogNumber"] + ".jpg") + + return list_of_files''' + +def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs): + + # ensure the destination directory exists + dst_path = Path(dst_path) + os.makedirs(dst_path, exist_ok=True) + + # download images + print(f"Downloading {num_samples} images to '{os.path.abspath(dst_path)}'...") + print(f"\tNote* there are only ~35,000 images in the database") + print(f"\tNote* that many images are not available which may slow down the download process") + + files_downloaded = [] + num_downloaded = 0 + num_tries = 0 # number of images tried to download, can be used to set beginning offset for future downloads + offset = kwargs.get("offset", 0) + + while 
num_downloaded < num_samples: + # get list of images to download + image_files = get_image_list(**kwargs) + if not image_files: + print("No more image files to download.") + break + + # download images in the list + with tqdm(total=num_samples, desc="Downloading images", initial=num_downloaded, unit="image") as pbar: + for image_file in image_files: + + url = f"https://beaty.b-cdn.net/{image_file}" + try: + response = requests.get(url, stream=True) + except Exception as e: + print(f"An error occurred while downloading {url}: {e}") + return len(files_downloaded), files_downloaded, num_tries + + num_tries += 1 + + if response.status_code == 200: + with open(dst_path / image_file, "wb") as f: + f.write(response.content) + + files_downloaded.append(image_file) + num_downloaded += 1 + pbar.update(1) + + if num_downloaded >= num_samples: + break + + # add a random delay to avoid overloading the server + time.sleep(random.uniform(0.1, 1)) + + offset += len(image_files) + kwargs["offset"] = offset + + return len(files_downloaded), files_downloaded, num_tries + +if __name__ == "__main__": + num, files, num_tries = download_images("data/images", num_samples=1000, limit=200) + print(f"Downloaded: {num} images") + print(f"Files: {files}") + print(f"final offset: {num_tries}") + \ No newline at end of file diff --git a/beatylm2/utils/__init__.py b/beatylm2/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/beatylm2/utils/data_utils.py b/beatylm2/utils/data_utils.py new file mode 100644 index 0000000..accb20f --- /dev/null +++ b/beatylm2/utils/data_utils.py @@ -0,0 +1,24 @@ +from pygbif import occurrences as occ + +def get_image_list(**kwargs): + # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ + + # default search arguments + search_args = { + "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13", + "mediaType": "StillImage", + "limit": 1000, + } + + # override default search arguments with user-provided 
arguments + search_args.update(kwargs) + + # search for occurrences + query = occ.search(**search_args) + + # extract the list of image files + list_of_files = [] + for record in query["results"]: + list_of_files.append(record["catalogNumber"] + ".jpg") + + return list_of_files \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4c8ee97 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +# python_version == "python==3.10.14" +pygbif==0.6.4 +tqdm==4.66.4 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c94df86 --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +from setuptools import setup, find_packages + +setup( + name="beatylm2", + version="0.1", + packages=find_packages(), + install_requires=[ + "pygbif==0.6.4", + "tqdm==4.66.4", + # add other dependencies here + ], + python_requires="==3.10.*", +) From daffa328086c5338fdbefc3100b2e41d8a352030 Mon Sep 17 00:00:00 2001 From: Ethan Zhao <75817837+Ezepheros@users.noreply.github.com> Date: Sat, 13 Jul 2024 12:40:54 +0800 Subject: [PATCH 3/9] Update README.md for downloading data --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 021be9b..246559b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ # UBC Beaty Biodiversity Museum - ML Project Demo A proof of concept project to utilize machine learning techniques for handling, organizing, and presenting large quantities of herbarium data. + +## Data Download +To download images from the beaty collection: +1. Clone the repo +2. Run `git checkout ethan-dev` +3. `cd` into the repo +4. Make sure you are in the venv or conda env you want to work with +5. Run `pip install -e .` + - this will install the repo as a package, the -e flag tells it that you are editing the package so that when you make changes, you don't have to reinstall the package each time +6. Run `cd beatylm2/data` +7. 
Run `python download.py` + - If you want to change the number of images you want to download, you have to modify the 'num_samples' parameter in the main function From 588cc429fc1075e7b9db116a184e5de7ad09d219 Mon Sep 17 00:00:00 2001 From: Ethan Zhao <75817837+Ezepheros@users.noreply.github.com> Date: Sat, 13 Jul 2024 12:41:46 +0800 Subject: [PATCH 4/9] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 246559b..3bbbd9c 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ A proof of concept project to utilize machine learning techniques for handling, ## Data Download To download images from the beaty collection: 1. Clone the repo -2. Run `git checkout ethan-dev` -3. `cd` into the repo +2. `cd` into the repo +3. Run `git checkout ethan-dev` 4. Make sure you are in the venv or conda env you want to work with 5. Run `pip install -e .` - this will install the repo as a package, the -e flag tells it that you are editing the package so that when you make changes, you don't have to reinstall the package each time From a9b516098db0525d20d12cdaecf89b5fe8423823 Mon Sep 17 00:00:00 2001 From: Ethan Zhao <75817837+Ezepheros@users.noreply.github.com> Date: Sat, 13 Jul 2024 12:43:23 +0800 Subject: [PATCH 5/9] Update README.md From a299693f62bcab30bf5ccfea7a67cd82c4955b3c Mon Sep 17 00:00:00 2001 From: Ezepheros Date: Sat, 13 Jul 2024 12:46:23 +0800 Subject: [PATCH 6/9] change default samples downloaded --- beatylm2/data/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py index ecb0548..634eacc 100644 --- a/beatylm2/data/download.py +++ b/beatylm2/data/download.py @@ -89,7 +89,7 @@ def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs): return len(files_downloaded), files_downloaded, num_tries if __name__ == "__main__": - num, files, num_tries = download_images("data/images", num_samples=1000, 
limit=200) + num, files, num_tries = download_images("data/images", num_samples=50, limit=200) print(f"Downloaded: {num} images") print(f"Files: {files}") print(f"final offset: {num_tries}") From ca9a59467a97da24b62f4c20b45e62dc6c1be78a Mon Sep 17 00:00:00 2001 From: Ezepheros Date: Sat, 13 Jul 2024 12:48:40 +0800 Subject: [PATCH 7/9] add doc string --- beatylm2/utils/data_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/beatylm2/utils/data_utils.py b/beatylm2/utils/data_utils.py index accb20f..d64ba3c 100644 --- a/beatylm2/utils/data_utils.py +++ b/beatylm2/utils/data_utils.py @@ -1,7 +1,8 @@ from pygbif import occurrences as occ def get_image_list(**kwargs): - # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ + ''' For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ + ''' # default search arguments search_args = { From e3ea19be68661dc16944e34a834a3d6d31650660 Mon Sep 17 00:00:00 2001 From: Ezepheros Date: Sat, 13 Jul 2024 12:50:41 +0800 Subject: [PATCH 8/9] add docstring --- beatylm2/data/download.py | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py index 634eacc..d5327f5 100644 --- a/beatylm2/data/download.py +++ b/beatylm2/data/download.py @@ -7,33 +7,14 @@ import random from tqdm import tqdm -''' Definition of get_image_list from bbl-ml-demo/beatylm2/utils/data_utils.py: -from pygbif import occurrences as occ - -def get_image_list(**kwargs): - # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ - - # default search arguments - search_args = { - "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13", - "mediaType": "StillImage", - "limit": 1000, - } - - # override default search arguments with user-provided arguments - search_args.update(kwargs) - - # search for occurrences - query = 
occ.search(**search_args) - - # extract the list of image files - list_of_files = [] - for record in query["results"]: - list_of_files.append(record["catalogNumber"] + ".jpg") - - return list_of_files''' - def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs): + ''' + Download images from the Beaty Biodiversity Museum collection. + Args: + dst_path (Union[Path, str]): The destination directory to save the images. + num_samples (int): The number of images to download. + **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ + ''' # ensure the destination directory exists dst_path = Path(dst_path) From 1d5a83da59d45c3822f43fae65ceddde49af3e7b Mon Sep 17 00:00:00 2001 From: Ezepheros Date: Sat, 13 Jul 2024 12:56:28 +0800 Subject: [PATCH 9/9] add docstring --- beatylm2/data/download.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py index d5327f5..4ca44c9 100644 --- a/beatylm2/data/download.py +++ b/beatylm2/data/download.py @@ -10,10 +10,10 @@ def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs): ''' Download images from the Beaty Biodiversity Museum collection. - Args: - dst_path (Union[Path, str]): The destination directory to save the images. - num_samples (int): The number of images to download. - **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ + args: + - dst_path (Union[Path, str]): The destination directory to save the images. + - num_samples (int): The number of images to download. + - **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/ ''' # ensure the destination directory exists