From 9369b56c446adbdcd06527cae6f5d6d96e484a6c Mon Sep 17 00:00:00 2001
From: Ezepheros <ethanzhao9876@gmail.com>
Date: Thu, 13 Jun 2024 09:52:55 +0800
Subject: [PATCH 1/9] add dir

---
 src/data/download.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/data/download.py

diff --git a/src/data/download.py b/src/data/download.py
new file mode 100644
index 0000000..e69de29

From b36a7fdfc41c185ffc9454ef6447a3e45e25faa3 Mon Sep 17 00:00:00 2001
From: Ezepheros <ethanzhao9876@gmail.com>
Date: Thu, 13 Jun 2024 13:09:33 +0800
Subject: [PATCH 2/9] add download files

---
 .gitignore                                    |  5 +
 .../download.py => beatylm2/data/__init__.py  |  0
 beatylm2/data/download.py                     | 96 +++++++++++++++++++
 beatylm2/utils/__init__.py                    |  0
 beatylm2/utils/data_utils.py                  | 24 +++++
 requirements.txt                              |  3 +
 setup.py                                      | 13 +++
 7 files changed, 141 insertions(+)
 rename src/data/download.py => beatylm2/data/__init__.py (100%)
 create mode 100644 beatylm2/data/download.py
 create mode 100644 beatylm2/utils/__init__.py
 create mode 100644 beatylm2/utils/data_utils.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
index 82f9275..8e6c66e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,8 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# additional
+
+# data
+*.jpg
\ No newline at end of file
diff --git a/src/data/download.py b/beatylm2/data/__init__.py
similarity index 100%
rename from src/data/download.py
rename to beatylm2/data/__init__.py
diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py
new file mode 100644
index 0000000..ecb0548
--- /dev/null
+++ b/beatylm2/data/download.py
@@ -0,0 +1,96 @@
+from beatylm2.utils.data_utils import get_image_list
+from pathlib import Path
+import requests
+from typing import List, Union
+import os
+import time
+import random
+from tqdm import tqdm
+
+''' Definition of get_image_list from bbl-ml-demo/beatylm2/utils/data_utils.py:
+from pygbif import occurrences as occ
+
+def get_image_list(**kwargs):
+    # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
+    
+    # default search arguments
+    search_args = {
+        "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13",
+        "mediaType": "StillImage",
+        "limit": 1000,
+    }
+
+    # override default search arguments with user-provided arguments
+    search_args.update(kwargs)
+
+    # search for occurrences
+    query = occ.search(**search_args)
+
+    # extract the list of image files
+    list_of_files = []
+    for record in query["results"]:
+        list_of_files.append(record["catalogNumber"] + ".jpg")
+
+    return list_of_files'''
+
+def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs):
+
+    # ensure the destination directory exists
+    dst_path = Path(dst_path)
+    os.makedirs(dst_path, exist_ok=True)
+
+    # download images
+    print(f"Downloading {num_samples} images to '{os.path.abspath(dst_path)}'...")
+    print(f"\tNote* there are only ~35,000 images in the database")
+    print(f"\tNote* that many images are not available which may slow down the download process")
+
+    files_downloaded = []
+    num_downloaded = 0
+    num_tries = 0 # number of images attempted so far; can be used to set the beginning offset for future downloads
+    offset = kwargs.get("offset", 0)
+
+    while num_downloaded < num_samples:
+        # get list of images to download
+        image_files = get_image_list(**kwargs)
+        if not image_files:
+            print("No more image files to download.")
+            break
+
+        # download images in the list
+        with tqdm(total=num_samples, desc="Downloading images", initial=num_downloaded, unit="image") as pbar:
+            for image_file in image_files:
+                
+                url = f"https://beaty.b-cdn.net/{image_file}"
+                try:
+                    response = requests.get(url, stream=True)
+                except Exception as e:
+                    print(f"An error occurred while downloading {url}: {e}")
+                    return len(files_downloaded), files_downloaded, num_tries
+
+                num_tries += 1
+
+                if response.status_code == 200:
+                    with open(dst_path / image_file, "wb") as f:
+                        f.write(response.content)
+
+                    files_downloaded.append(image_file)
+                    num_downloaded += 1
+                    pbar.update(1)
+
+                    if num_downloaded >= num_samples:
+                        break
+
+                # add a random delay to avoid overloading the server
+                time.sleep(random.uniform(0.1, 1))
+
+            offset += len(image_files)
+            kwargs["offset"] = offset
+        
+    return len(files_downloaded), files_downloaded, num_tries
+
+if __name__ == "__main__":
+    num, files, num_tries = download_images("data/images", num_samples=1000, limit=200)
+    print(f"Downloaded: {num} images")
+    print(f"Files: {files}")
+    print(f"final offset: {num_tries}")
+    
\ No newline at end of file
diff --git a/beatylm2/utils/__init__.py b/beatylm2/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/beatylm2/utils/data_utils.py b/beatylm2/utils/data_utils.py
new file mode 100644
index 0000000..accb20f
--- /dev/null
+++ b/beatylm2/utils/data_utils.py
@@ -0,0 +1,24 @@
+from pygbif import occurrences as occ
+
+def get_image_list(**kwargs):
+    # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
+    
+    # default search arguments
+    search_args = {
+        "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13",
+        "mediaType": "StillImage",
+        "limit": 1000,
+    }
+
+    # override default search arguments with user-provided arguments
+    search_args.update(kwargs)
+
+    # search for occurrences
+    query = occ.search(**search_args)
+
+    # extract the list of image files
+    list_of_files = []
+    for record in query["results"]:
+        list_of_files.append(record["catalogNumber"] + ".jpg")
+
+    return list_of_files
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4c8ee97
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+# Python version: 3.10.14
+pygbif==0.6.4
+tqdm==4.66.4
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c94df86
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,13 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="beatylm2",
+    version="0.1",
+    packages=find_packages(),
+    install_requires=[
+        "pygbif==0.6.4",
+        "tqdm==4.66.4",
+        # add other dependencies here
+    ],
+    python_requires="==3.10.*",
+)

From daffa328086c5338fdbefc3100b2e41d8a352030 Mon Sep 17 00:00:00 2001
From: Ethan Zhao <75817837+Ezepheros@users.noreply.github.com>
Date: Sat, 13 Jul 2024 12:40:54 +0800
Subject: [PATCH 3/9] Update README.md for downloading data

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 021be9b..246559b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,15 @@
 # UBC Beaty Biodiversity Museum - ML Project Demo
 
 A proof of concept project to utilize machine learning techniques for handling, organizing, and presenting large quantities of herbarium data.
+
+## Data Download
+To download images from the beaty collection:
+1. Clone the repo
+2. Run `git checkout ethan-dev`
+3. `cd` into the repo
+4. Make sure you are in the venv or conda env you want to work with
+5. Run `pip install -e .`
+    - this will install the repo as a package, the -e flag tells it that you are editing the package so that when you make changes, you don't have to reinstall the package each time
+6. Run `cd beatylm2/data`
+7. Run `python download.py`
+    - To change how many images are downloaded, edit the `num_samples` argument of the `download_images(...)` call under `if __name__ == "__main__":` in `download.py`

From 588cc429fc1075e7b9db116a184e5de7ad09d219 Mon Sep 17 00:00:00 2001
From: Ethan Zhao <75817837+Ezepheros@users.noreply.github.com>
Date: Sat, 13 Jul 2024 12:41:46 +0800
Subject: [PATCH 4/9] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 246559b..3bbbd9c 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@ A proof of concept project to utilize machine learning techniques for handling,
 ## Data Download
 To download images from the beaty collection:
 1. Clone the repo
-2. Run `git checkout ethan-dev`
-3. `cd` into the repo
+2. `cd` into the repo
+3. Run `git checkout ethan-dev`
 4. Make sure you are in the venv or conda env you want to work with
 5. Run `pip install -e .`
     - this will install the repo as a package, the -e flag tells it that you are editing the package so that when you make changes, you don't have to reinstall the package each time

From a9b516098db0525d20d12cdaecf89b5fe8423823 Mon Sep 17 00:00:00 2001
From: Ethan Zhao <75817837+Ezepheros@users.noreply.github.com>
Date: Sat, 13 Jul 2024 12:43:23 +0800
Subject: [PATCH 5/9] Update README.md


From a299693f62bcab30bf5ccfea7a67cd82c4955b3c Mon Sep 17 00:00:00 2001
From: Ezepheros <ethanzhao9876@gmail.com>
Date: Sat, 13 Jul 2024 12:46:23 +0800
Subject: [PATCH 6/9] change default samples downloaded

---
 beatylm2/data/download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py
index ecb0548..634eacc 100644
--- a/beatylm2/data/download.py
+++ b/beatylm2/data/download.py
@@ -89,7 +89,7 @@ def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs):
     return len(files_downloaded), files_downloaded, num_tries
 
 if __name__ == "__main__":
-    num, files, num_tries = download_images("data/images", num_samples=1000, limit=200)
+    num, files, num_tries = download_images("data/images", num_samples=50, limit=200)
     print(f"Downloaded: {num} images")
     print(f"Files: {files}")
     print(f"final offset: {num_tries}")

From ca9a59467a97da24b62f4c20b45e62dc6c1be78a Mon Sep 17 00:00:00 2001
From: Ezepheros <ethanzhao9876@gmail.com>
Date: Sat, 13 Jul 2024 12:48:40 +0800
Subject: [PATCH 7/9] add doc string

---
 beatylm2/utils/data_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/beatylm2/utils/data_utils.py b/beatylm2/utils/data_utils.py
index accb20f..d64ba3c 100644
--- a/beatylm2/utils/data_utils.py
+++ b/beatylm2/utils/data_utils.py
@@ -1,7 +1,8 @@
 from pygbif import occurrences as occ
 
 def get_image_list(**kwargs):
-    # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
+    ''' For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
+    '''
     
     # default search arguments
     search_args = {

From e3ea19be68661dc16944e34a834a3d6d31650660 Mon Sep 17 00:00:00 2001
From: Ezepheros <ethanzhao9876@gmail.com>
Date: Sat, 13 Jul 2024 12:50:41 +0800
Subject: [PATCH 8/9] add docstring

---
 beatylm2/data/download.py | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py
index 634eacc..d5327f5 100644
--- a/beatylm2/data/download.py
+++ b/beatylm2/data/download.py
@@ -7,33 +7,14 @@
 import random
 from tqdm import tqdm
 
-''' Definition of get_image_list from bbl-ml-demo/beatylm2/utils/data_utils.py:
-from pygbif import occurrences as occ
-
-def get_image_list(**kwargs):
-    # For arguments see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
-    
-    # default search arguments
-    search_args = {
-        "datasetKey": "07fd0d79-4883-435f-bba1-58fef110cd13",
-        "mediaType": "StillImage",
-        "limit": 1000,
-    }
-
-    # override default search arguments with user-provided arguments
-    search_args.update(kwargs)
-
-    # search for occurrences
-    query = occ.search(**search_args)
-
-    # extract the list of image files
-    list_of_files = []
-    for record in query["results"]:
-        list_of_files.append(record["catalogNumber"] + ".jpg")
-
-    return list_of_files'''
-
 def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs):
+    '''
+    Download images from the Beaty Biodiversity Museum collection.
+    Args:
+        dst_path (Union[Path, str]): The destination directory to save the images.
+        num_samples (int): The number of images to download.
+        **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
+    '''
 
     # ensure the destination directory exists
     dst_path = Path(dst_path)

From 1d5a83da59d45c3822f43fae65ceddde49af3e7b Mon Sep 17 00:00:00 2001
From: Ezepheros <ethanzhao9876@gmail.com>
Date: Sat, 13 Jul 2024 12:56:28 +0800
Subject: [PATCH 9/9] add docstring

---
 beatylm2/data/download.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/beatylm2/data/download.py b/beatylm2/data/download.py
index d5327f5..4ca44c9 100644
--- a/beatylm2/data/download.py
+++ b/beatylm2/data/download.py
@@ -10,10 +10,10 @@
 def download_images(dst_path: Union[Path, str], num_samples: int, **kwargs):
     '''
     Download images from the Beaty Biodiversity Museum collection.
-    Args:
-        dst_path (Union[Path, str]): The destination directory to save the images.
-        num_samples (int): The number of images to download.
-        **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
+    args:
+        - dst_path (Union[Path, str]): The destination directory to save the images.
+        - num_samples (int): The number of images to download.
+        - **kwargs: Additional keyword arguments to pass to the get_image_list function. see: https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/
     '''
 
     # ensure the destination directory exists