Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: move iNat tools from ca-tools #18

Merged
merged 2 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/synonyms.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1515,7 +1515,7 @@ Phlox gracilis,Microsteris gracilis
Phoradendron macrophyllum,Phoradendron leucarpum subsp. macrophyllum,INAT
Phoradendron serotinum subsp. macrophyllum,Phoradendron leucarpum subsp. macrophyllum
Phoradendron serotinum subsp. tomentosum,Phoradendron leucarpum subsp. tomentosum
Phoradendron villosum,Phoradendron leucarpum subsp. tomentosum
Phoradendron villosum,Phoradendron leucarpum subsp. tomentosum,INAT
Phragmites australis var. berlandieri,Phragmites australis
Phragmites berlandieri,Phragmites australis
Phragmites communis var. berlandieri,Phragmites australis
Expand Down
6 changes: 3 additions & 3 deletions data/taxa.csv
Original file line number Diff line number Diff line change
Expand Up @@ -971,7 +971,7 @@ Juncus phaeocephalus var. phaeocephalus,brown-headed rush,N,60393,4491,61036,Bro
Juncus tenuis,poverty rush,N,29725,4496,69930,Poverty Rush
Juncus xiphioides,iris-leaved rush,N,29743,4502,57110,Irisleaf Rush
Juniperus californica,California juniper,N,29749,4503,57889,California Juniper
Juniperus communis var. saxatilis,mountain juniper,N,60424,10933,,Common Juniper
Juniperus communis var. saxatilis,mountain juniper,N,60424,10933,81001,Common Juniper
Keckiella breviflora var. breviflora,,N,75718,4520,59015,Gaping Keckiella
Keckiella corymbosa,,N,29883,4523,62043,Keckiella
Kickxia elatine,fluellin,X,29898,4532,64332
Expand Down Expand Up @@ -1360,7 +1360,7 @@ Phoenix canariensis,Canary Island date palm,X,37878,6449,78554
Pholistoma auritum var. auritum,fiesta flower,N,63599,6455,58926,Fiesta Flower
Pholistoma membranaceum,white fiesta flower,N,37886,6456,57352,White Fiesta Flower
Phoradendron leucarpum subsp. macrophyllum,big leaf mistletoe,N,98417,13230,166732,Colorado Desert Mistletoe
Phoradendron leucarpum subsp. tomentosum,oak mistletoe,N,98416,13231,545020,Pacific Mistletoe
Phoradendron leucarpum subsp. tomentosum,oak mistletoe,N,98416,13231,343310,Pacific Mistletoe
Phragmites australis,common reed,N,37931,6465,64237,Common Reed
Phyla lanceolata,,N,37942,6466,59042,Lanceleaf Fogfruit
Phyla nodiflora,,N,37943,6467,59040,Common Lippia,perennial,white,5,11
Expand Down Expand Up @@ -1526,7 +1526,7 @@ Ranunculus orthorhynchus var. bloomeri,,N,64956,7051,81337,Bloomer's Buttercup
Ranunculus orthorhynchus var. orthorhynchus,,N,64957,7052,81338,Straightbeak Buttercup
Ranunculus repens,,X,40965,7056,48229
Ranunculus sceleratus,cursed crowsfoot,N,40971,7059,59301,Cursed Buttercup
Ranunculus sceleratus var. sceleratus,,X,64987,11983,,,annual,yellow,4,6
Ranunculus sceleratus var. sceleratus,,X,64987,11983,81341,,annual,yellow,4,6
Raphanus raphanistrum,jointed charlock,X,40991,7063,55411
Raphanus sativus,wild radish,X,40992,7064,995125
Rhamnus alaternus,Italian buckthorn,X,81104,9447,82856
Expand Down
316 changes: 316 additions & 0 deletions lib/tools/inat.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
import path from "node:path";
import { Files } from "../files.js";
import { CSV } from "../csv.js";
import { sleep } from "../util.js";
import { TaxaCSV } from "./taxacsv.js";

/**
* @typedef {{id:string,
* name:string,
* phylum:string,
* rank:string,
* scientificName:string,
* specificEpithet:string
* }} INatCSVData
*/

export class INat {
    /**
     * Taxa read from the iNaturalist taxonomy CSV, keyed by the
     * `scientificName` column. Populated by #checkTaxon during parsing.
     * @type {Map<string,InatTaxon>}
     */
    static #taxa = new Map();

    /**
     * Compares the local taxa against the iNaturalist taxonomy CSV and logs
     * discrepancies to `errorLog`:
     *  - taxa whose iNat name is not in the CSV (then queried through the
     *    iNaturalist API to look for a current name),
     *  - taxa whose iNat ID differs from the one in the CSV,
     *  - iNat exceptions that no longer apply.
     * The CSV is downloaded and extracted first if it is not already present.
     *
     * @param {string} toolsDataDir directory under which the "inat" data directory lives
     * @param {string} dataDir passed through to updateTaxaCSV when `update` is set
     * @param {Taxa} taxa
     * @param {import("../exceptions.js").Exceptions} exceptions
     * @param {ErrorLog} errorLog
     * @param {string} csvFileName name of the taxonomy CSV inside the "inat" data directory
     * @param {boolean} update if true, write the mismatched IDs found in the CSV back via updateTaxaCSV
     */
    static async analyze(
        toolsDataDir,
        dataDir,
        taxa,
        exceptions,
        errorLog,
        csvFileName,
        update,
    ) {
        const inatDataDir = toolsDataDir + "/inat";
        const csvFilePath = inatDataDir + "/" + csvFileName;

        // Create data directory if it's not there.
        Files.mkdir(inatDataDir);

        // Download the data file if it doesn't exist.
        if (!Files.exists(csvFilePath)) {
            const url =
                "https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip";
            const zipFilePath = inatDataDir + "/" + path.basename(url);
            console.log("retrieving iNaturalist species");
            await Files.fetch(url, zipFilePath);
            await Files.zipFileExtract(zipFilePath, "taxa.csv", csvFilePath);
        }

        console.log("loading iNaturalist species");
        await CSV.parseStream(
            inatDataDir,
            csvFileName,
            undefined,
            undefined,
            INat.#checkTaxon,
        );
        console.log("iNat: " + INat.#taxa.size + " taxa loaded");

        /** @type {{name:string,iNatName:string}[]} */
        const missingTaxa = [];

        /** @type {Map<string,string>} */
        const idsToUpdate = new Map();

        for (const taxon of taxa.getTaxonList()) {
            const name = taxon.getName();
            if (name.includes(" unknown")) {
                continue;
            }
            const iNatName = taxon.getINatName();
            const iNatTaxon = INat.#taxa.get(iNatName);
            if (!iNatTaxon) {
                if (!exceptions.hasException(name, "inat", "notintaxondata")) {
                    errorLog.log(name, "not found in " + csvFileName, iNatName);
                }
                // Queried below even when an exception exists; the lookup
                // itself decides what to report.
                missingTaxa.push({ name, iNatName });
                continue;
            }
            if (iNatTaxon.getID() !== taxon.getINatID()) {
                errorLog.log(
                    name,
                    "iNat ID in " +
                        csvFileName +
                        " does not match ID in taxa.csv",
                    iNatTaxon.getID(),
                    taxon.getINatID(),
                );
                idsToUpdate.set(name, iNatTaxon.getID());
            }
        }

        console.log("iNat: looking up missing names");
        // Sequential on purpose: each lookup ends with a throttling delay.
        for (const data of missingTaxa) {
            await INat.#findCurrentName(
                taxa,
                exceptions,
                errorLog,
                data.name,
                data.iNatName,
            );
        }

        INat.#checkExceptions(taxa, exceptions, errorLog);

        if (update) {
            updateTaxaCSV(dataDir, idsToUpdate);
        }
    }

    /**
     * Checks the iNat exceptions and makes sure they still apply.
     *
     * @param {Taxa} taxa
     * @param {import("../exceptions.js").Exceptions} exceptions
     * @param {ErrorLog} errorLog
     */
    static #checkExceptions(taxa, exceptions, errorLog) {
        for (const [name, v] of exceptions.getExceptions()) {
            const inatExceptions = v.inat;
            if (!inatExceptions) {
                continue;
            }

            // Make sure the taxon is still in our list.
            const taxon = taxa.getTaxon(name);
            if (!taxon) {
                // Don't process global exceptions if taxon is not in local list.
                if (taxa.isSubset() && !v.local) {
                    continue;
                }
                errorLog.log(name, "has iNat exceptions but not in taxa.csv");
                continue;
            }

            // The table is keyed by iNat name (see analyze), which can differ
            // from the local taxon name, so look it up the same way here.
            const iNatData = INat.#taxa.get(taxon.getINatName());

            for (const k of Object.keys(inatExceptions)) {
                switch (k) {
                    case "notintaxondata":
                        if (iNatData) {
                            errorLog.log(
                                name,
                                "found in iNat data but has notintaxondata exception",
                            );
                        }
                        break;
                    default:
                        errorLog.log(name, "unrecognized iNat exception", k);
                }
            }
        }
    }

    /**
     * CSV row callback: records rows whose phylum is "Tracheophyta" and which
     * have a non-empty specific epithet. Uses `INat` rather than `this`
     * because it is handed to the CSV parser as a bare function.
     *
     * @param {INatCSVData} record
     */
    static #checkTaxon(record) {
        if (record["phylum"] === "Tracheophyta" && record["specificEpithet"]) {
            INat.#taxa.set(
                record["scientificName"],
                new InatTaxon(record["id"]),
            );
        }
    }

    /**
     * Picks the search result that corresponds to `iNatName`.
     * A lone result is accepted as is; otherwise exactly one result must have
     * a `matched_term` equal to `iNatName`. Two or more such results are
     * logged as ambiguous and treated as no match.
     *
     * @param {{matched_term:string,name:string,rank:string}[]} results
     * @param {string} iNatName
     * @param {ErrorLog} errorLog
     * @returns {{matched_term:string,name:string,rank:string}|undefined}
     */
    static #findMatchingResult(results, iNatName, errorLog) {
        if (results.length === 1) {
            return results[0];
        }
        let match;
        for (const result of results) {
            if (result.matched_term !== iNatName) {
                continue;
            }
            if (match) {
                errorLog.log(
                    iNatName,
                    "found more than one matched_term",
                    match.matched_term,
                    result.matched_term,
                );
                return undefined;
            }
            match = result;
        }
        return match;
    }

    /**
     * Queries the iNaturalist taxa search API for a name that was not found
     * in the taxonomy CSV and logs either the synonym line to add or the fact
     * that nothing was found. A non-OK HTTP response is logged and skipped
     * instead of being parsed as if it were a result set.
     *
     * @param {Taxa} taxa
     * @param {import("../exceptions.js").Exceptions} exceptions
     * @param {ErrorLog} errorLog
     * @param {string} name local taxon name
     * @param {string} iNatName name used to search iNaturalist
     */
    static async #findCurrentName(taxa, exceptions, errorLog, name, iNatName) {
        const url = new URL("https://api.inaturalist.org/v1/taxa");
        url.searchParams.set("q", iNatName);

        const response = await fetch(url);
        if (!response.ok) {
            errorLog.log(
                name,
                "iNat lookup request failed",
                String(response.status),
            );
        } else {
            const data = await response.json();
            // Guard against a body without a results array.
            const results = Array.isArray(data?.results) ? data.results : [];

            let result = INat.#findMatchingResult(results, iNatName, errorLog);
            if (result === undefined) {
                const parts = iNatName.split(" ");
                if (parts.length === 2) {
                    // If it's "genus species", try "genus species species".
                    result = INat.#findMatchingResult(
                        results,
                        iNatName + " " + parts[1],
                        errorLog,
                    );
                } else if (parts.length === 3 && parts[1] === parts[2]) {
                    // If it's "genus species species", try "genus species".
                    result = INat.#findMatchingResult(
                        results,
                        parts[0] + " " + parts[1],
                        errorLog,
                    );
                }
            }

            if (result !== undefined) {
                errorLog.log(
                    name,
                    "found iNat synonym",
                    INat.makeSynonymName(result, errorLog) + "," + name + ",INAT",
                );
            } else if (
                !exceptions.hasException(name, "inat", "notintaxondata")
            ) {
                errorLog.log(name, "iNat lookup found no results");
                // Make sure this doesn't have an iNat ID.
                const iNatID = taxa.getTaxon(name).getINatID();
                if (iNatID) {
                    errorLog.log(
                        name,
                        "iNat lookup failed but has iNat ID",
                        iNatID,
                    );
                }
            }
        }

        // Delay to throttle queries to iNat API.
        await sleep(800);
    }

    /**
     * Converts an iNaturalist result name into the form used for synonyms.
     * A three-word name gets "var." or "subsp." inserted before the last word
     * when the rank is "variety" or "subspecies"; a "hybrid" is left as is;
     * any other rank on a three-word name is logged and the name returned
     * unchanged. Names with any other word count are returned unchanged.
     *
     * @param {{name:string,rank:string}} iNatResult
     * @param {ErrorLog} errorLog
     * @returns {string}
     */
    static makeSynonymName(iNatResult, errorLog) {
        const synParts = iNatResult.name.split(" ");
        if (synParts.length === 3) {
            switch (iNatResult.rank) {
                case "subspecies":
                    synParts.splice(2, 0, "subsp.");
                    break;
                case "variety":
                    synParts.splice(2, 0, "var.");
                    break;
                case "hybrid":
                    // Leave as is.
                    break;
                default:
                    errorLog.log(
                        iNatResult.name,
                        "unrecognized iNat rank",
                        iNatResult.rank,
                    );
            }
        }
        return synParts.join(" ");
    }
}

/**
 * Read-only holder for the ID of one taxon taken from the iNaturalist
 * taxonomy CSV.
 */
class InatTaxon {
    /** @type {string} */
    #taxonId;

    /**
     * @param {string} id value of the CSV row's `id` column
     */
    constructor(id) {
        this.#taxonId = id;
    }

    /**
     * @returns {string} the ID given to the constructor
     */
    getID() {
        const taxonId = this.#taxonId;
        return taxonId;
    }
}

/**
 * Loads the taxa CSV for `dataDir`, overwrites the "inat id" column of every
 * row whose `taxon_name` has a (non-empty) replacement ID in `idsToUpdate`,
 * and writes the CSV back out. The write happens even when no row changed.
 *
 * @param {string} dataDir
 * @param {Map<string,string>} idsToUpdate taxon name -> new iNat ID
 */
function updateTaxaCSV(dataDir, idsToUpdate) {
    const csv = new TaxaCSV(dataDir);

    for (const row of csv.getTaxa()) {
        const replacementId = idsToUpdate.get(row.taxon_name);
        if (replacementId) {
            row["inat id"] = replacementId;
        }
    }

    csv.write();
}
17 changes: 10 additions & 7 deletions scripts/cpl-tools.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { Calflora } from "../lib/tools/calflora.js";
import { Exceptions } from "../lib/exceptions.js";
import { ErrorLog } from "../lib/errorlog.js";
import { Calscape } from "../lib/tools/calscape.js";
import { INat } from "../lib/tools/inat.js";

const TOOLS = {
CALFLORA: "calflora",
Expand Down Expand Up @@ -72,13 +73,15 @@ async function build(program, options) {
);
break;
case TOOLS.INAT:
// await INat.analyze(
// TOOLS_DATA_DIR,
// taxa,
// exceptions,
// errorLog,
// options.inTaxafile,
// );
await INat.analyze(
TOOLS_DATA_DIR,
options.datadir,
taxa,
exceptions,
errorLog,
options.inTaxafile,
!!options.update,
);
break;
case TOOLS.JEPSON_EFLORA: {
// const eflora = new JepsonEFlora(
Expand Down
Loading