Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SCRIPTS: Add script to merge geojson files and format them in a MapRoulette tag fix friendly way #376

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 210 additions & 0 deletions scripts/merge_geojson/merge_geojson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env python
"""
Merge GeoJSON files of the same type in the same directory, optionally into a
newline-delimited GeoJSON file compliant with RFC 7464 for MapRoulette.
"""
import argparse
import geojson
import typing
import glob
import gzip
import re
import os
import json
import logging

# Byte 0x1E (ASCII record separator), written before each record of the
# line-by-line output.
ASCII_RECORD_SEPARATOR = b"\x1E"
# Byte 0x0A (ASCII line feed), written after each record of the line-by-line
# output.
ASCII_LINE_SEPARATOR = b"\x0A"


def get_geojson_files(country_path: str, flag: str) -> typing.List[str]:
    """
    Recursively collect the geojson files under a path that belong to a flag.

    A path is returned when its final component contains ``flag`` and the
    whole path ends in ``.geojson`` or ``.geojson.gz``.
    """
    pattern = os.path.join(country_path, "**", "*" + flag + "*")
    geojson_suffix = re.compile(r".*\.geojson(|\.gz)$")
    matches = []
    for candidate in glob.glob(pattern, recursive=True):
        if geojson_suffix.search(candidate):
            matches.append(candidate)
    return matches


# GeoJSON geometry type -> OSM element type used as the task id prefix.
# The geometry type is only a heuristic for the element type.
_ELEMENT_TYPE_BY_GEOMETRY = {
    "Position": "node",
    "Point": "node",
    "LineString": "way",
    "Polygon": "way",  # TODO check polygon
    "MultiPolygon": "relation",
    "MultiPoint": "relation",
    "MultiLineString": "relation",
}

# Tag keys that are never turned into tag fixes.
_IGNORED_TAG_KEYS = frozenset(
    {
        "last_edit_version",
        "last_edit_user_name",
        "last_edit_user_id",
        "last_edit_time",
        "last_edit_changeset",
    }
)


def _tag_operations(fix_suggestion: dict) -> list:
    """
    Build the setTags/unsetTags operations for one "UPDATE" fix suggestion.
    """
    add_tags = {}
    del_tags = []
    for descriptor in fix_suggestion["descriptors"]:
        if descriptor["name"] != "TAG":
            logging.error(
                f'Fixes for {descriptor["name"]} are not yet supported'
            )
            logging.debug(str(descriptor))
            continue
        if descriptor["key"] in _IGNORED_TAG_KEYS:
            continue
        if descriptor["type"] in ("ADD", "UPDATE"):
            add_tags[descriptor["key"]] = descriptor["value"]
        elif descriptor["type"] == "REMOVE":
            del_tags.append(descriptor["key"])
    # Cooperative challenges supports setTags and unsetTags
    operations = []
    if add_tags:
        operations.append({"operation": "setTags", "data": add_tags})
    if del_tags:
        operations.append({"operation": "unsetTags", "data": del_tags})
    return operations


# NOTE(review): a collaborator suggested later repurposing this conversion
# inside a pyatlas-checks CLI built on the maproulette-python-client, and
# adding a Cooperative Challenge model to that client
# (see osmlab/maproulette-python-client#63).
def fix_suggestions_to_cooperativeWork(
    feature: geojson.Feature,
) -> geojson.FeatureCollection:
    """
    Convert a feature's fix_suggestions into a MapRoulette cooperativeWork.

    This is only useful with RFC7464 compliant geojson. It currently only
    covers tag fixes and not geometry fixes. It also does not support added
    or removed features at this time.

    The feature is modified in place: its "id" is set to
    <elementType>/<numericId> (such as node/42) when the geometry type is
    recognised, and "fix_suggestions" is removed from its properties.

    Returns a FeatureCollection holding the feature. A "cooperativeWork"
    member is attached only when at least one tag operation was produced and
    the feature has an id to attach it to.

    Raises KeyError / IndexError when the feature lacks "properties",
    "geometry", "fix_suggestions" or a first "feature_properties" entry with
    an "osmIdentifier".
    """
    properties = feature["properties"]
    # Currently assuming that there is only one osmIdentifier per feature
    osm_id = properties["feature_properties"][0]["osmIdentifier"]
    # A null geometry is treated like an unknown geometry type.
    geometry = feature["geometry"] or {}
    element_type = _ELEMENT_TYPE_BY_GEOMETRY.get(geometry.get("type"))
    if element_type is not None:
        # f-string rather than "+" so a numeric identifier does not raise.
        feature["id"] = f"{element_type}/{osm_id}"

    operations = []
    for fix_suggestion in properties["fix_suggestions"].values():
        if fix_suggestion["type"] == "UPDATE":
            operations.extend(_tag_operations(fix_suggestion))
        else:
            # We currently don't support "ADD" or "REMOVE" for objects
            logging.error(f'Unknown type {fix_suggestion["type"]}')
            logging.debug(str(fix_suggestion))

    feature_collection = geojson.FeatureCollection([feature])
    if operations:
        feature_id = feature.get("id")
        if feature_id is None:
            # Unknown geometry and no pre-existing id: there is nothing to
            # attach the operations to, so emit the task without them rather
            # than crash.
            logging.error(
                f"Cannot build cooperativeWork for osmIdentifier {osm_id}: "
                f"unsupported geometry type {geometry.get('type')}"
            )
        else:
            # version=2 is required, type=2 is for osc base64 encoded (i.e.
            # geometry changes). type=1 is for tag fixes.
            # cooperativeWork goes in the main FeatureCollection body
            feature_collection["cooperativeWork"] = {
                "meta": {"version": 2, "type": 1},
                "operations": [
                    {
                        "operationType": "modifyElement",
                        "data": {"id": feature_id, "operations": operations},
                    }
                ],
            }
    del properties["fix_suggestions"]
    return feature_collection


def write_feature_rfc7464(file_name: str, feature: geojson.Feature):
    """
    Append one feature to an RFC7464 compliant file for use with MapRoulette.

    The feature is wrapped in its own FeatureCollection (converted through
    fix_suggestions_to_cooperativeWork when it carries "fix_suggestions") and
    appended as: record separator byte, UTF-8 encoded JSON, line feed byte.
    """
    fc = (
        fix_suggestions_to_cooperativeWork(feature)
        if "fix_suggestions" in feature["properties"]
        else geojson.FeatureCollection([feature])
    )
    # ensure_ascii=False keeps unicode characters as-is (review suggestion);
    # encoding explicitly as UTF-8 avoids depending on the locale encoding.
    payload = geojson.dumps(fc, ensure_ascii=False).encode("utf-8")
    # One binary append for the whole record instead of reopening the file
    # for the separator, the body and the terminator.
    with open(file_name, "ab") as fh:
        fh.write(ASCII_RECORD_SEPARATOR + payload + ASCII_LINE_SEPARATOR)


def read_file(file_name: str) -> typing.Optional[geojson.FeatureCollection]:
    """
    Load a geojson file, transparently handling gzip compression.

    Files ending in ".gz" are opened with gzip; everything else is opened as
    plain text. Both are decoded as UTF-8.

    Returns the object parsed by geojson.load, or None when the file starts
    with the record separator byte (i.e. it is already a line-by-line file,
    which callers skip).

    Raises json.decoder.JSONDecodeError (after logging the file name) when
    the content is not valid JSON.
    """
    opener = gzip.open if file_name.endswith(".gz") else open
    try:
        with opener(file_name, "rt", encoding="utf-8") as fh:
            first = fh.read(1)
            if first.encode("utf-8") == ASCII_RECORD_SEPARATOR:
                return None
            # Rewind so the parser sees the character consumed above.
            fh.seek(0)
            return geojson.load(fh)
    except json.decoder.JSONDecodeError:
        logging.error(f"{file_name} had a json parse error")
        raise


def line_by_line(files: typing.List[str], check: str):
    """
    Write every feature of the given files to "<check>.line.geojson".

    The output is created next to the first input file (or in the current
    directory when there are no inputs) and replaces any previous output of
    the same name. Inputs that read_file reports as already line-by-line
    (None) are skipped; inputs that are not a FeatureCollection are logged
    and skipped.
    """
    write_name = check + ".line.geojson"
    if files:
        write_name = os.path.join(os.path.dirname(files[0]), write_name)
    if os.path.isfile(write_name):
        os.remove(write_name)
    output_path = os.path.abspath(write_name)
    for f in files:
        # The output of an earlier run has a name that can be picked up as
        # an input. It was just removed above, so reading it would fail;
        # it must never be treated as an input.
        if os.path.abspath(f) == output_path:
            continue
        fc = read_file(f)
        if fc is None:
            continue
        if isinstance(fc, geojson.FeatureCollection):
            for feature in fc["features"]:
                write_feature_rfc7464(write_name, feature)
        else:
            logging.error(f"Bad file? {f}")


def concat_files(files: typing.List[str], check: str):
    """
    Merge the features of the given files into "<check>.concat.geojson".

    The output is written next to the first input file (or in the current
    directory when there are no inputs). The first usable FeatureCollection
    is the base and the features of the others are appended to it. Inputs
    that read_file reports as line-by-line (None) are skipped; inputs that
    are not a FeatureCollection are logged and skipped. When nothing usable
    was read, an empty FeatureCollection is written.
    """
    write_name = check + ".concat.geojson"
    if files:
        write_name = os.path.join(os.path.dirname(files[0]), write_name)
    output_path = os.path.abspath(write_name)
    merged = None
    for f in files:
        # The output of an earlier run has a name that can be picked up as
        # an input; merging it again would duplicate every feature.
        if os.path.abspath(f) == output_path:
            continue
        fc = read_file(f)
        if fc is None:
            continue
        if not isinstance(fc, geojson.FeatureCollection):
            logging.error(f"Bad file? {f}")
            continue
        if merged is None:
            merged = fc
        else:
            # In-place extend avoids re-copying the accumulated list for
            # every input file.
            merged["features"].extend(fc["features"])
    if merged is None:
        # Write valid GeoJSON instead of the literal "null".
        merged = geojson.FeatureCollection([])
    with open(write_name, "w", encoding="utf-8") as fp:
        geojson.dump(merged, fp)


def main():
    """
    Command line entry point.

    For every directory given on the command line, collect the geojson files
    of the requested check and either merge them into one FeatureCollection
    or, with --line-by-line, write them as a line-by-line file. Arguments
    that are not directories are skipped with a warning.
    """
    parser = argparse.ArgumentParser(
        description="Create a line-by-line geojson file"
    )
    parser.add_argument("files", nargs="+")
    # A single check name is expected, so --check is the preferred spelling
    # (review suggestion); -c and --checks keep working. The value is
    # required because the file search cannot run without it.
    parser.add_argument(
        "-c",
        "--check",
        "--checks",
        dest="checks",
        required=True,
        help="name of the check whose geojson files should be merged",
    )
    parser.add_argument("--line-by-line", action="store_true")
    args = parser.parse_args()
    for f in args.files:
        logging.info(f"Merging geojson files for {args.checks} under {f}")
        if not os.path.isdir(f):
            logging.warning(f"Skipping {f}: not a directory")
            continue
        files = get_geojson_files(f, args.checks)
        if args.line_by_line:
            line_by_line(files, args.checks)
        else:
            concat_files(files, args.checks)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions scripts/merge_geojson/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
geojson>=2.5.0