Commit

code for downloading GBIF metadata and images
miguelcleon committed Aug 14, 2024
1 parent 9a3d828 commit 69c8799
Showing 3 changed files with 320 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -48,3 +48,4 @@ po/*~

# RStudio Connect folder
rsconnect/
credentials.json
211 changes: 211 additions & 0 deletions downloadGBIFImagesLocal.py
@@ -0,0 +1,211 @@
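# Reads the gbifmultimedia_*.txt metadata files produced by
# downloadgbifmetadata.py from ./metadata and downloads each record's still
# images, plus a per-image metadata text file, into subfolders under ./images.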
from pygbif import species
import os
import logging
import json
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_exponential

# List of species names
species_names = [
"Calypte anna",
"Selasphorus sasin",
"Selasphorus rufus",
"Archilochus alexandri",
"Calypte costae",
"Selasphorus calliope",
"Selasphorus platycercus",
"Archilochus colubris",
"Amazilia violiceps",
"Cynanthus latirostris",
"Eugenes fulgens"
]

# Define the local folder paths
source_folder_path = './metadata' # Replace with the path of the folder containing gbifmultimedia files
target_folder_path = './images' # Replace with the path of the target folder for images and metadata

# Function to get taxon key
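# name_backbone fuzzily matches a scientific name against the GBIF backbone
# taxonomy; 'usageKey' is the numeric taxon identifier used throughout the
# occurrence API. If no match is found, .get() returns None.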
def get_taxon_key(name):
result = species.name_backbone(name=name)
return result.get('usageKey')

# Get taxon keys for all species
taxon_keys = {get_taxon_key(name): name for name in species_names}
print(taxon_keys)
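
# The printed key -> name mapping is for reference when matching files to
# species (the numeric suffix of each gbifmultimedia_*.txt file is a taxon
# key); the processing below is driven by whatever files sit in ./metadata.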

# Configure logging
def configure_logging(species_folder):
log_file = os.path.join(species_folder, 'failures.log')
logging.basicConfig(
level=logging.ERROR,
format='%(asctime)s %(levelname)s %(message)s',
handlers=[
logging.FileHandler(log_file, mode='a'),
logging.StreamHandler()
        ],
        force=True  # basicConfig is a no-op once the root logger has handlers; force re-points the log file for each species (Python 3.8+)
    )
return log_file
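
# download_image retries transient failures up to 5 times, waiting between 4
# and 10 seconds with exponential backoff (see the tenacity decorator below).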

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def download_image(url, timeout=60):
try:
response = requests.get(url, timeout=timeout)
response.raise_for_status() # Raise HTTPError for bad responses (4xx and 5xx)
return response.content
except requests.exceptions.RequestException as e:
logging.error(f"Error downloading image {url}: {e}")
raise

# Function to save a file locally
def save_file_locally(file_name, file_content, folder_path):
try:
file_path = os.path.join(folder_path, file_name)
with open(file_path, 'wb') as f:
f.write(file_content)
return file_path
except Exception as e:
logging.error(f"Error saving file {file_name}: {e}")
raise

# Function to check if a file exists locally
def file_exists_locally(file_name, folder_path):
try:
file_path = os.path.join(folder_path, file_name)
return os.path.exists(file_path)
except Exception as e:
logging.error(f"Error checking if file exists {file_name}: {e}")
raise

# Function to create a subfolder locally if it doesn't exist
def get_or_create_local_folder(folder_name, parent_folder_path):
try:
folder_path = os.path.join(parent_folder_path, folder_name)
if not os.path.exists(folder_path):
os.makedirs(folder_path)
return folder_path
except Exception as e:
logging.error(f"Error creating folder {folder_name}: {e}")
raise

# Function to download an image and create metadata
def download_and_create_metadata(row, subfolder_path, download_counts):
try:
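        # The 'media' column survives the CSV round trip as a Python-literal
        # string; swapping single for double quotes lets json.loads parse it,
        # though values containing apostrophes will still fail and be logged.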
media = json.loads(row['media'].replace("'", '"')) if isinstance(row['media'], str) else []
gbif_id = row['gbifID']

image_urls = [item['identifier'] for item in media if item.get('type') == 'StillImage']

for idx, image_url in enumerate(image_urls, start=1):
image_name = image_url.split('/')[-1]
image_name_with_id = f"{os.path.splitext(image_name)[0]}_{gbif_id}_{idx}{os.path.splitext(image_name)[1]}"

if not file_exists_locally(image_name_with_id, subfolder_path):
try:
# Download the image with retry logic
image_content = download_image(image_url)
download_counts['success'] += 1

# Save the image locally
save_file_locally(image_name_with_id, image_content, subfolder_path)

# Create metadata content
metadata_content = '\n'.join([f"{key}: {value}" for key, value in row.items()])
metadata_file_name = f"{os.path.splitext(image_name_with_id)[0]}.txt"
save_file_locally(metadata_file_name, metadata_content.encode('utf-8'), subfolder_path)
except Exception as e:
download_counts['fail'] += 1
logging.error(f"Failed to download image {image_url} for gbifID {gbif_id}: {e}")
else:
print(f"File already exists: {image_name_with_id}")
except Exception as e:
download_counts['fail'] += 1
logging.error(f"Failed to process record with gbifID {row.get('gbifID')}: {e}")

# Function to read a previously downloaded metadata file from the local folder
def read_metadata_file(file_path):
try:
return pd.read_csv(file_path, delimiter='\t', low_memory=False)
except Exception as e:
        logging.error(f"Failed to read metadata from {file_path}: {e}")
raise

# List gbifmultimedia files in the source local folder
def list_local_files(folder_path):
try:
files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
print(f"Files found in folder {folder_path}: {files}")
return files
except Exception as e:
logging.error(f"Failed to list files in folder {folder_path}: {e}")
raise

local_files = list_local_files(source_folder_path)
print(f"Local files: {local_files}") # Debug statement

# Process each gbifmultimedia file
for local_file in local_files:
file_path = os.path.join(source_folder_path, local_file)

# Extract the suffix from the file name
suffix = os.path.splitext(local_file)[0].split('_')[-1]

# Get or create a subfolder in the target local folder
subfolder_name = f"subfolder_{suffix}"
subfolder_path = get_or_create_local_folder(subfolder_name, target_folder_path)

# Determine the species folder path for logging
species_folder = os.path.join('./logs', suffix)

# Ensure the species folder exists
if not os.path.exists(species_folder):
os.makedirs(species_folder)

# Configure logging for this species
configure_logging(species_folder)

try:
        # Read the metadata file from the local source folder
        df = read_metadata_file(file_path)
except Exception as e:
        print(f"Skipping file {local_file} due to read error: {e}")
continue

# Print the column names for debugging
print(f"Columns in {local_file}: {df.columns.tolist()}")

# Check if the 'media' column exists
if 'media' not in df.columns:
print(f"'media' column not found in {local_file}, skipping.")
continue

# Initialize counters for downloads
download_counts = {'success': 0, 'fail': 0}

try:
        # Submit one task per record; with a single worker this runs
        # sequentially, which keeps the shared download_counts dict safe.
        # Raise max_workers with care: plain-dict counters are not thread-safe.
        with ThreadPoolExecutor(max_workers=1) as executor:
futures = [executor.submit(download_and_create_metadata, row, subfolder_path, download_counts) for _, row in df.iterrows()]

for future in as_completed(futures):
try:
future.result()
except Exception as e:
logging.error(f"Error occurred during processing: {e}")

except Exception as e:
print(f"Skipping file {local_file} due to processing error: {e}")
continue

# Log the counts at the end
log_file = os.path.join(species_folder, 'failures.log')
with open(log_file, 'a') as log:
log.write(f"\nTotal successful downloads: {download_counts['success']}\n")
log.write(f"Total failed downloads: {download_counts['fail']}\n")

print("Data downloaded and organized by species.")
108 changes: 108 additions & 0 deletions downloadgbifmetadata.py
@@ -0,0 +1,108 @@
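# Queries the GBIF occurrence search API for US records with coordinates, no
# geospatial issues, and still images for each taxon key, and writes one
# tab-separated metadata file per taxon into ./metadata for
# downloadGBIFImagesLocal.py to consume.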
import os
import json
import pandas as pd
from pygbif import occurrences as occ
from pygbif import species
import time

# Load credentials from the JSON file
with open('credentials.json', 'r') as file:
credentials = json.load(file)

USERNAME = credentials['USERNAME']
PASSWORD = credentials['PASSWORD']
EMAIL = credentials['EMAIL']
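
# NOTE: occ.search below is unauthenticated, so these values are currently
# unused; they would only be needed for the authenticated download API
# (occ.download). The credentials file is expected to look like this
# (hypothetical values):
# {"USERNAME": "gbif_user", "PASSWORD": "gbif_password", "EMAIL": "you@example.com"}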

# List of species names
species_names = [
"Selasphorus sasin",
"Selasphorus rufus",
"Archilochus alexandri",
"Calypte costae",
"Selasphorus calliope",
"Selasphorus platycercus",
"Archilochus colubris",
"Amazilia violiceps",
"Cynanthus latirostris",
"Eugenes fulgens"
"Anthracothorax dominicus",
"Orthorhyncus cristatus",
"Anthracothorax viridis",
"Eulampis holosericeus",
"Riccordia maugaeus"
]

# Function to get taxon key
def get_taxon_key(name):
result = species.name_backbone(name=name)
return result.get('usageKey')

# Get taxon keys for all species
taxon_keys = {get_taxon_key(name): name for name in species_names}
print(taxon_keys)
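
# The live lookup above is printed for reference but immediately overridden by
# this hard-coded mapping (presumably pinning keys so runs do not depend on
# the live backbone match).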
taxon_keys = {
2476675: "Selasphorus sasin",
2476676: "Selasphorus rufus",
5228513: "Archilochus alexandri",
2476675: "Calypte costae",
7597244: "Selasphorus calliope",
2476680: "Selasphorus platycercus",
5228514: "Archilochus colubris",
2476462: "Amazilia violiceps",
5228542: "Cynanthus latirostris",
2476108: "Eugenes fulgens",
2476715: "Anthracothorax dominicus",
2476284: "Orthorhyncus cristatus",
2476728: "Anthracothorax viridis",
2476399: "Eulampis holosericeus",
11091395: "Riccordia maugaeus"
}

# Function to save dataframe to CSV with retries
def save_to_csv_with_retries(df, filename, max_retries=5):
for attempt in range(max_retries):
try:
df.to_csv(filename, index=False, sep='\t')
return
except OSError as e:
print(f"Error saving {filename}: {e}")
if attempt < max_retries - 1:
print("Retrying...")
time.sleep(2 ** attempt) # Exponential backoff
else:
print(f"Failed to save {filename} after {max_retries} attempts")

# Function to download GBIF metadata
def download_gbif_metadata():
    os.makedirs('./metadata', exist_ok=True)  # make sure the output folder exists before any writes
    for taxon_key, species_name in taxon_keys.items():
all_records = []
offset = 0
limit = 300
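        # Page through matches 300 at a time (the search API's per-request
        # maximum). Deep paging is capped (around 100,000 records), beyond
        # which GBIF's asynchronous download API would be needed.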
while True:
response = occ.search(
taxonKey=taxon_key,
country='US',
hasCoordinate=True,
hasGeospatialIssue=False,
mediatype='StillImage',
limit=limit,
offset=offset
)

results = response.get('results', [])
all_records.extend(results)

if len(results) < limit:
break

offset += limit

df = pd.DataFrame(all_records)
filename = f"./metadata/gbifmultimedia_{taxon_key}.txt"
save_to_csv_with_retries(df, filename)
print(f"Downloaded metadata for {species_name} with taxon_key {taxon_key}")

# Download the metadata and save to files
download_gbif_metadata()
print("Downloaded metadata for all specified species.")
