Skip to content

Commit

Permalink
Merge pull request #94 from OneZoom/wd_ott_stats
Browse files Browse the repository at this point in the history
Add stats about the otts coming from wikidata
  • Loading branch information
davidebbo authored Jul 12, 2024
2 parents 8b13221 + 5132731 commit 837f9f3
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,35 @@ def output_simplified_tree(tree, taxonomy_file, outdir, version, seed, save_sql=
)


def display_WD_ott_stats(OTT_ptrs):
    """
    Display some stats about OTTs coming from Wikidata

    Walks every entry of ``OTT_ptrs`` (a mapping keyed by OTT id) and, for the
    entries whose "rank" is "species" and that carry a "wd" item, compares the
    entry's own OTT id with the ``wd_ott`` value stored on that item. Three
    counts are then logged at INFO level: matching, mismatching, and items
    with no ``wd_ott`` at all. Each mismatch is also logged at DEBUG level.

    Entries without a "rank" or "wd" key are skipped on purpose: this is a
    best-effort report and must not abort the surrounding mapping step.
    """
    matching_otts = 0
    mismatching_otts = 0
    no_wd_otts = 0
    for ott, node in OTT_ptrs.items():
        try:
            if node["rank"] != "species":
                continue
            wd_item = node["wd"]
            # NOTE(review): assumes the item exposes the same value through
            # .get("wd_ott") and .wd_ott (dict with attribute access) - confirm.
            wd_ott = wd_item.get("wd_ott")
            if wd_ott is None:
                no_wd_otts += 1
            elif ott == wd_ott:
                matching_otts += 1
            else:
                # Count first, so that a failure while formatting the debug
                # message (e.g. an item without Q) cannot lose the mismatch.
                mismatching_otts += 1
                logging.debug(f"Q{wd_item.Q}: OTT {ott} does not match {wd_ott}")
        except (KeyError, AttributeError):
            # No rank / no wikidata item / unexpected item shape: nothing to report.
            continue

    logging.info("✔ Stats on Wikidata OTT matching:")
    logging.info(f" Leaves where the WD ott matches the ott: {matching_otts}")
    logging.info(f" Leaves where the WD ott does not match the ott: {mismatching_otts}")
    logging.info(f" Leaves where WD does not have an ott: {no_wd_otts}")


def map_wiki_info(
source_ptrs,
source_order,
Expand Down Expand Up @@ -768,6 +797,8 @@ def map_wiki_info(

logging.info("✔ Wikidata/wikipedia data mapped")

display_WD_ott_stats(OTT_ptrs)

return popularity_steps == 2


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,12 @@ def wikidata_info(
f" Cannot convert IPNI property {ipni} to integer"
f" in Q{item_instance.Q} ({label(json_item)})."
)
try:
# Save the ott coming from the wikidata item, if any
item_instance.wd_ott = int(claims["P9157"][0]["mainsnak"]["datavalue"]["value"])
except (KeyError, ValueError):
pass

# Only map alternate Qs if current item has a main language link
# Otherwise, it's likely not an interesting item to map to, and it may
# end up overriding a better existing mapping
Expand Down
2 changes: 2 additions & 0 deletions oz_tree_build/utilities/generate_filtered_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def generate_filtered_wikidata_dump(wikipedia_dump_file, filtered_wikipedia_dump
"P5055": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # irmng id
"P830": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # EOL id
"P961": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # IPNI id
"P9157": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # OTT id
"P3151": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # iNaturalist id
"P141": [{"references": [{"snaks": {"P627": [{"datavalue": {"value": KEEP}}]}}]}], # IUCN id
"P1420": [{"mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}}], # taxon synonym
"P18": [
Expand Down

0 comments on commit 837f9f3

Please sign in to comment.