Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ethan dev #1

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# additional

# data
*.jpg
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# UBC Beaty Biodiversity Museum - ML Project Demo

A proof of concept project to utilize machine learning techniques for handling, organizing, and presenting large quantities of herbarium data.

## Data Download
To download images from the Beaty collection:
1. Clone the repo
2. `cd` into the repo
3. Run `git checkout ethan-dev`
4. Make sure you are in the venv or conda env you want to work with
5. Run `pip install -e .`
- This installs the repo as a package. The `-e` (editable) flag means that changes you make to the code take effect immediately, so you don't have to reinstall the package each time.
6. Run `cd beatylm2/data`
7. Run `python download.py`
- To change the number of images downloaded, edit the `num_samples` argument in the `if __name__ == "__main__":` block at the bottom of `download.py`
Empty file added beatylm2/data/__init__.py
Empty file.
77 changes: 77 additions & 0 deletions beatylm2/data/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from beatylm2.utils.data_utils import get_image_list
from pathlib import Path
import requests
from typing import List, Union
import os
import time
import random
from tqdm import tqdm

def download_images(
    dst_path: Union[Path, str],
    num_samples: int,
    *,
    request_timeout: float = 30.0,
    **kwargs,
) -> tuple[int, List[str], int]:
    '''
    Download images from the Beaty Biodiversity Museum collection.

    File names are fetched page by page with get_image_list and each one is
    requested from the image CDN. Only responses with status 200 are written
    to disk; any other status is counted as a try and skipped.

    args:
    - dst_path (Union[Path, str]): The destination directory to save the images. Created if it does not exist.
    - num_samples (int): The number of images to download.
    - request_timeout (float): Seconds to wait for each image request before giving up. Keyword-only.
    - **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
      An "offset" entry, if given, is used as the starting page offset.

    returns:
    - (num_downloaded, files_downloaded, num_tries): the number of images saved,
      their file names, and the number of requests that completed (with any
      status code).

    If a request fails at the network level (including a timeout), the error
    is printed and the results collected so far are returned immediately.
    '''

    # ensure the destination directory exists
    dst_path = Path(dst_path)
    os.makedirs(dst_path, exist_ok=True)

    # download images
    print(f"Downloading {num_samples} images to '{os.path.abspath(dst_path)}'...")
    print("\tNote* there are only ~35,000 images in the database")
    print("\tNote* that many images are not available which may slow down the download process")

    files_downloaded = []
    num_tries = 0  # number of images tried to download, can be used to set beginning offset for future downloads
    offset = kwargs.get("offset", 0)

    while len(files_downloaded) < num_samples:
        # get list of images to download
        image_files = get_image_list(**kwargs)
        if not image_files:
            print("No more image files to download.")
            break

        # download images in the list
        with tqdm(
            total=num_samples,
            desc="Downloading images",
            initial=len(files_downloaded),
            unit="image",
        ) as pbar:
            for image_file in image_files:
                url = f"https://beaty.b-cdn.net/{image_file}"
                try:
                    # Not streamed: the body is read in full inside this call, so
                    # mid-transfer failures are caught here and the connection is
                    # released even for non-200 responses. The timeout keeps one
                    # stalled connection from hanging the whole run.
                    response = requests.get(url, timeout=request_timeout)
                except requests.RequestException as e:
                    print(f"An error occurred while downloading {url}: {e}")
                    return len(files_downloaded), files_downloaded, num_tries

                num_tries += 1

                if response.status_code == 200:
                    with open(dst_path / image_file, "wb") as f:
                        f.write(response.content)

                    files_downloaded.append(image_file)
                    pbar.update(1)

                if len(files_downloaded) >= num_samples:
                    break

                # add a random delay to avoid overloading the server
                time.sleep(random.uniform(0.1, 1))

        # advance to the next page of search results
        offset += len(image_files)
        kwargs["offset"] = offset

    return len(files_downloaded), files_downloaded, num_tries

if __name__ == "__main__":
    # Script entry point: fetch a small sample of images into data/images,
    # asking the search API for 200 records per page, then report the outcome.
    outcome = download_images("data/images", num_samples=50, limit=200)
    saved_count, saved_files, attempts = outcome

    print(f"Downloaded: {saved_count} images")
    print(f"Files: {saved_files}")
    # attempts counts completed requests, which is what the script reports as the offset
    print(f"final offset: {attempts}")

Empty file added beatylm2/utils/__init__.py
Empty file.
25 changes: 25 additions & 0 deletions beatylm2/utils/data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pygbif import occurrences as occ

def get_image_list(**kwargs):
    ''' Search GBIF occurrences and return one "<catalogNumber>.jpg" file name per result.

    Keyword arguments are forwarded to the occurrence search and take precedence
    over the defaults below.
    For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
    '''

    # defaults first, then caller overrides (later keys win in a dict display)
    # NOTE(review): the datasetKey is presumably the Beaty collection on GBIF - confirm
    params = {
        "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13",
        "mediaType": "StillImage",
        "limit": 1000,
        **kwargs,
    }

    # run the occurrence search
    response = occ.search(**params)

    # build the image file name for every returned record
    return [record["catalogNumber"] + ".jpg" for record in response["results"]]
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Python version: 3.10.14
pygbif==0.6.4
tqdm==4.66.4
13 changes: 13 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from setuptools import setup, find_packages

# Package metadata for the beatylm2 project (installable with `pip install -e .`).
setup(
    name="beatylm2",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "pygbif==0.6.4",
        "tqdm==4.66.4",
        # beatylm2/data/download.py imports requests directly, so declare it
        # here instead of relying on another package to pull it in.
        "requests",
        # add other dependencies here
    ],
    # only Python 3.10.x is accepted
    python_requires="==3.10.*",
)