Skip to content

Commit

Permalink
Update get_wiki_image to work better with extinct
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebbo committed Nov 14, 2024
1 parent c391db9 commit 0f78463
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 10 deletions.
88 changes: 79 additions & 9 deletions oz_tree_build/images_and_vernaculars/get_wiki_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,14 @@ def save_wiki_image(db, leaf_data, image_name, src, src_id, rating, output_dir,
s = placeholder(db)

ott = leaf_data["ott"]
if not ott:
logger.warning(f"No OTT for Q{src_id}. Can't save {image_name}")
return False

# Wikimedia uses underscores instead of spaces in URLs
escaped_image_name = image_name.replace(" ", "_")
escaped_image_name = image_name.replace(" ", "_").replace("&", "%26").replace("+", "%2B")
# Also escape the ampersand and plus signs in the image name
escaped_image_name = escaped_image_name.replace("&", "%26").replace("+", "%2B")
image_dir = os.path.normpath(os.path.join(output_dir, str(src), subdir_name(src_id)))
image_path = f"{image_dir}/{src_id}.jpg"

Expand All @@ -298,7 +303,7 @@ def save_wiki_image(db, leaf_data, image_name, src, src_id, rating, output_dir,
existing_image_name = leaf_data["img"][len(wiki_image_url_prefix) :]
if existing_image_name == escaped_image_name:
if os.path.isfile(image_path):
logger.info(f"Image '{image_name}' for {ott} is in the db, and at {image_path}")
logger.debug(f"Image '{image_name}' for {ott} is in the db, and at {image_path}")
return True
else:
logger.warning(f"{image_name} for {ott} is in the db, but the " f"file is missing, so re-processing")
Expand Down Expand Up @@ -407,6 +412,7 @@ def save_wiki_image(db, leaf_data, image_name, src, src_id, rating, output_dir,
datetime.datetime.now(),
),
)
db.commit()

# Since we added a new image, we need to update all the image bits for that ott
process_image_bits.resolve(db, ott)
Expand Down Expand Up @@ -463,6 +469,7 @@ def process_leaf(
db,
ott_or_taxon,
image_name=None,
taxa_data=None,
rating=None,
skip_images=None,
output_dir=None,
Expand Down Expand Up @@ -492,6 +499,10 @@ def process_leaf(
(ott, qid, name) = result[0]
logger.info(f"Processing '{name}' (ott={ott}, qid={qid})")

# If we didn't get a qid from the database, try to get it from the taxa data
if qid is None:
qid = get_qid_from_taxa_data(taxa_data, name)

# Three cases for the rating:
# - If it's passed in, use it
# - If it's not passed in for a bespoke image, use 40000
Expand All @@ -511,7 +522,12 @@ def process_leaf(
# Get the highest bespoke src_id, and add 1 to it for the new image src_id
src_id = get_next_src_id_for_src(db, src)
else:
image = get_preferred_or_first_image_from_json_item(json_item)
# If the data file has an image for this taxon, use it
image_name = get_image_from_taxa_data(taxa_data, name)
if image_name:
image = {"name": image_name}
else:
image = get_preferred_or_first_image_from_json_item(json_item)
src = src_flags["wiki"]
src_id = qid
if image:
Expand All @@ -522,7 +538,32 @@ def process_leaf(
save_wiki_vernaculars_for_qid(db, ott, qid, vernaculars_by_language)


def process_clade(db, ott_or_taxon, dump_file, skip_images, output_dir, cropper=None):
def get_prop_from_taxa_data(taxa_data, taxon, prop):
    """
    Get a property for a taxon from the taxa data dictionary.

    `taxa_data` maps taxon names to per-taxon entries. If the entry for
    `taxon` contains a "redirect" key, the property is read from the entry
    named by that redirect instead (a single hop; the redirecting entry's own
    properties are not consulted).

    Returns the property value, or None when `taxa_data` is None or empty, the
    taxon is not listed, its entry is empty, the redirect target is missing or
    empty, or the entry does not define `prop`.
    """
    if not taxa_data:
        return None

    entry = taxa_data.get(taxon)
    if not entry:
        return None

    if "redirect" in entry:
        # A dangling redirect (or one pointing at an empty entry) is treated
        # as "no data" rather than raising, so callers can fall back cleanly.
        entry = taxa_data.get(entry["redirect"])
        if not entry:
            return None

    return entry.get(prop)


def get_image_from_taxa_data(taxa_data, taxon):
    """
    Look up the "image" property recorded for `taxon` in the taxa data.
    """
    image_name = get_prop_from_taxa_data(taxa_data, taxon, "image")
    return image_name


def get_qid_from_taxa_data(taxa_data, taxon):
    """
    Look up the "qid" property recorded for `taxon` in the taxa data.
    """
    qid = get_prop_from_taxa_data(taxa_data, taxon, "qid")
    return qid


def process_clade(db, ott_or_taxon, dump_file, taxa_data, skip_images, output_dir, cropper=None):
"""
`cropper` can be an ImageAnalysisClient, a crop location in the image
(x, y, width, height), or None to carry out a default (centered) crop.
Expand Down Expand Up @@ -552,7 +593,24 @@ def process_clade(db, ott_or_taxon, dump_file, skip_images, output_dir, cropper=
WHERE ordered_leaves.id >= {s} AND ordered_leaves.id <= {s};
"""
rows = db.executesql(sql, (src_flags["wiki"], leaf_lft, leaf_rgt))
leaves_data = {qid: {"ott": ott, "taxon": name, "img": url} for qid, ott, name, url in rows}

# If some rows don't have a qid, try to get that from the taxa data
# If all else fails, skip that row.
fixed_rows = []
for row in rows:
# Skip rows with no ott
if row[1] is None:
continue
qid = row[0]
if not qid:
qid = get_qid_from_taxa_data(taxa_data, row[2])
row = (qid, row[1], row[2], row[3])
if not qid:
logger.warning(f"No qid for {row[2]}. Skipping it.")
continue
fixed_rows.append(row)

leaves_data = {qid: {"ott": ott, "taxon": name, "img": url} for qid, ott, name, url in fixed_rows}
logger.info(f"Found {len(leaves_data)} leaves in the database")

# Get leaves in the clade with no wiki vernaculars, ignoring verns from other sources
Expand All @@ -568,8 +626,10 @@ def process_clade(db, ott_or_taxon, dump_file, skip_images, output_dir, cropper=
leaves_that_got_images = set()
for qid, image, vernaculars in enumerate_wiki_dump_items(dump_file):
if not skip_images and qid in leaves_data:
image_name = None
if image:
# If the data file has an image for this taxon, use it
image_name = get_image_from_taxa_data(taxa_data, leaves_data[qid]["taxon"])
if not image_name and image:
# Fall back to the image from the dump
image_name = image["name"]
if image_name and save_wiki_image(
db, leaves_data[qid], image_name, src_flags["wiki"], qid, default_wiki_image_rating, output_dir, cropper
Expand Down Expand Up @@ -611,16 +671,21 @@ def process_args(args):
db = connect_to_database(database)
cropper = AzureImageCropper(config)

taxa_data = {}
if args.taxa_data_file:
with open(args.taxa_data_file) as f:
taxa_data = json.load(f)

if args.subcommand == "leaf":
# Process one leaf at a time
if len(args.ott_or_taxa) > 1 and args.image is not None:
raise ValueError("Cannot specify multiple taxa when using a bespoke image")
for name in args.ott_or_taxa:
process_leaf(db, name, args.image, args.rating, args.skip_images, outdir, cropper)
process_leaf(db, name, args.image, taxa_data, args.rating, args.skip_images, outdir, cropper)
elif args.subcommand == "clade":
# Process all the taxa in the passed in clades
for name in args.ott_or_taxa:
process_clade(db, name, args.wd_dump, args.skip_images, outdir, cropper)
process_clade(db, name, args.wd_dump, taxa_data, args.skip_images, outdir, cropper)


def setup_logging(args):
Expand Down Expand Up @@ -665,6 +730,11 @@ def add_common_args(parser):
action="store_true",
help="Only process vernaculars, not images",
)
parser.add_argument(
"--taxa-data-file",
default=None,
help="JSON file with persisted data about various taxa",
)
parser.add_argument(
"-o",
"--output-dir",
Expand Down
1 change: 1 addition & 0 deletions tests/test_full_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def test_full_felidae_generation():
args.wikilang = "en"
args.popularity_file = ""
args.extra_source_file = None
args.taxa_data_file = None
args.exclude = []
args.info_on_focal_labels = []

Expand Down
3 changes: 2 additions & 1 deletion tests/test_get_wiki_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def wikimedia_response(self, image_name, licence="cc0", artist="John Doe"):
url = (
"https://api.wikimedia.org/w/api.php"
f"?action=query&titles=File%3a{image_name}&format=json&prop=imageinfo"
"&iiprop=extmetadata&iiextmetadatafilter=License|LicenseUrl|Artist"
"&iiprop=extmetadata&iiextmetadatafilter=License|LicenseShortName|LicenseUrl|Artist"
)
response = {
"query": {
Expand Down Expand Up @@ -222,6 +222,7 @@ def get_command_arguments(subcommand, ott_or_taxa, image, rating, output_dir, co
skip_images=None,
output_dir=output_dir,
conf_file=conf_file,
taxa_data_file=None,
)


Expand Down

0 comments on commit 0f78463

Please sign in to comment.