diff --git a/data_quality/check_wiki_keys.py b/data_quality/check_wiki_keys.py index e2ec2c1..ef37aa2 100644 --- a/data_quality/check_wiki_keys.py +++ b/data_quality/check_wiki_keys.py @@ -1,4 +1,5 @@ import sys +from tqdm import tqdm sys.path.append("oswm_codebase") from functions import * @@ -7,10 +8,10 @@ wiki_absence_dict = {} -for category in tags_dict: +for category in tqdm(tags_dict): wiki_absence_dict[category] = [] - for osm_key in tags_dict[category]: - print("testing ", osm_key) + for osm_key in tqdm(tags_dict[category], desc=category): + # print("testing ", osm_key) if not check_if_wikipage_exists(osm_key): print(" ", osm_key, " absent!!") diff --git a/data_quality/quality_dicts.py b/data_quality/quality_dicts.py index 6ee9ad8..0522be5 100644 --- a/data_quality/quality_dicts.py +++ b/data_quality/quality_dicts.py @@ -4,8 +4,6 @@ """ -feature_categories = ['sidewalks','crossings','kerbs'] - """ template dict: @@ -13,218 +11,176 @@ 'sidewalks': {}, 'crossings': {}, 'kerbs': {}, + 'other_footways': {}, } """ improper_keys = { - 'sidewalks': { - 'kerb':'sidewalks are drawn at path axis, kerb acess points should be literally at the kerb ("meio-fio", pt-br)', - 'opening_hours':'if it has opening hours it may be a private pathway, not a sidewalk', - 'paving_stones':'paving stones is a value for "surface key"','crossing':"It's inappropriate for Sidewalks, probably mistakenly tagged", - 'barrier':"if there's a barrier it may be a node in the sidewalk, but not the sidewalk itself", - - }, - - 'crossings': { - 'kerb': 'kerbs are points, crossings are lines', - 'barrier':"a crossing with a barrier may not be an actual crossing..." - - }, - - 'kerbs': { - 'opening_hours':'a crossing may have opening hours (brigdes?), but not a kerb', - 'crossing':"It's inappropriate for Kerbs, it's for crossings", - 'crossing_ref' :"It's inappropriate for Kerbs, it's for crossings", - 'name':'most kerbs have no name', - 'traffinc_calming':'generally the trafic calming refers to the crossing' - - + "sidewalks": { + "kerb": 'sidewalks are drawn at path axis, kerb acess points should be literally at the kerb ("meio-fio", pt-br)', + "opening_hours": "if it has opening hours it may be a private pathway, not a sidewalk", + "paving_stones": 'paving stones is a value for "surface key"', + "crossing": "It's inappropriate for Sidewalks, probably mistakenly tagged", + "barrier": "if there's a barrier it may be a node in the sidewalk, but not the sidewalk itself", + }, + "crossings": { + "kerb": "kerbs are points, crossings are lines", + "barrier": "a crossing with a barrier may not be an actual crossing...", + }, + "kerbs": { + "opening_hours": "a crossing may have opening hours (brigdes?), but not a kerb", + "crossing": "It's inappropriate for Kerbs, it's for crossings", + "crossing_ref": "It's inappropriate for Kerbs, it's for crossings", + "name": "most kerbs have no name", + "traffic_calming": "generally the trafic calming refers to the crossing", }, } uncanny_keys = { - 'sidewalks': { - 'traffic_signals':'may be used for crossings', - 'name':'most sidewalks dont have an actual name' - }, - 'crossings': {'level':'according to wiki it may be used only for indoor or if bound to a floor...'}, - 'kerbs': { - 'button_operated': "it may be referring to the crossing, may be OK", - 'traffic_signals:sound':"it may be referring to the crossing, may be OK", + "sidewalks": { + "traffic_signals": "may be used for crossings", + "name": "most sidewalks dont have an actual name", + }, + "crossings": { + "level": "according to wiki it may be used only for indoor or if bound to a floor..." + }, + "kerbs": { + "button_operated": "it may be referring to the crossing, may be OK", + "traffic_signals:sound": "it may be referring to the crossing, may be OK", "traffic_signals:vibration": "it may be referring to the crossing, may be OK", - "crossing:island" : "if in the middle of a crossing It's fine!! " - - } - , + "crossing:island": "if in the middle of a crossing It's fine!! ", + }, } replaceable_values = { - 'sidewalks': {}, - 'crossings': {'crossing':{'marked':'uncontrolled','zebra':'uncontrolled','island':"should use the TAG 'crossing:island=yes' "}}, - 'kerbs': {}, + "sidewalks": {}, + "crossings": { + "crossing": { + "marked": "uncontrolled", + "zebra": "uncontrolled", + "island": "should use the TAG 'crossing:island=yes' ", + } + }, + "kerbs": {}, } invalid_characters = { - '=' : "The '=' character is used ONLY in textual representation of tags to separate the key from the value", + "=": "The '=' character is used ONLY in textual representation of tags to separate the key from the value", } - - categories_dict_keys = { "improper_keys": { - 'about':"Keys that (almost certainly) shouldn't be used at that feature type", - 'dict' : improper_keys, - 'type':'keys', - - 'occurrences': { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, - }, - - - 'occ_count': { - 'sidewalks': 0, - 'crossings': 0, - 'kerbs': 0, - }, - - + "about": "Keys that (almost certainly) shouldn't be used at that feature type", + "dict": improper_keys, + "type": "keys", + "occurrences": { + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, + }, + "occ_count": { + "sidewalks": 0, + "crossings": 0, + "kerbs": 0, + }, }, - "uncanny_keys": { - 'about':"Keys that may be OK in some specific situations, but may be a mistake", - 'dict' : uncanny_keys, - 'type':'keys', - - 'occurrences': { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, - }, - - 'occ_count': { - 'sidewalks': 0, - 'crossings': 0, - 'kerbs': 0, - }, - + "about": "Keys that may be OK in some specific situations, but may be a mistake", + "dict": uncanny_keys, + "type": "keys", + "occurrences": { + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, + }, + "occ_count": { + "sidewalks": 0, + "crossings": 0, + "kerbs": 0, + }, }, - - - "keys_without_wiki": { - 'about':"Keys that may be wrong, because there's no wiki article for it", - 'dict' : 'quality_check/keys_without_wiki.json', - 'type':'keys', - - 'occurrences': { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, - }, - - 'occ_count': { - 'sidewalks': 0, - 'crossings': 0, - 'kerbs': 0, - }, - + "keys_without_wiki": { + "about": "Keys that may be wrong, because there's no wiki article for it", + "dict": "quality_check/keys_without_wiki.json", + "type": "keys", + "occurrences": { + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, }, - + "occ_count": { + "sidewalks": 0, + "crossings": 0, + "kerbs": 0, + }, + }, "replaceable_values": { - 'about':"values that are not wrong, but there's a better option that is in the commentary", - 'dict' : replaceable_values, - 'type':'values', - - 'occurrences': { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, - }, - - - 'occ_count': { - 'sidewalks': 0, - 'crossings': 0, - 'kerbs': 0, - }, - + "about": "values that are not wrong, but there's a better option that is in the commentary", + "dict": replaceable_values, + "type": "values", + "occurrences": { + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, + }, + "occ_count": { + "sidewalks": 0, + "crossings": 0, + "kerbs": 0, + }, }, - "wrong_mispelled_or_unlisted_values": { - 'about':"Values that are probably wrong, but they may be mispelled or just unlisted", - 'dict' : 'quality_check/valid_tag_values.json', - 'type':'values', - - 'occurrences': { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, - }, - - - 'occ_count': { - 'sidewalks': 0, - 'crossings': 0, - 'kerbs': 0, - }, - + "about": "Values that are probably wrong, but they may be mispelled or just unlisted", + "dict": "quality_check/valid_tag_values.json", + "type": "values", + "occurrences": { + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, + }, + "occ_count": { + "sidewalks": 0, + "crossings": 0, + "kerbs": 0, + }, }, - - "invalid_characters": { - 'about':"characters that should not be in the value or in the key", - 'dict' : invalid_characters, - 'type':'tags', - - 'occurrences': { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, - }, - - - 'occ_count': { - 'sidewalks': 0, - 'crossings': 0, - 'kerbs': 0, - }, - + "about": "characters that should not be in the value or in the key", + "dict": invalid_characters, + "type": "tags", + "occurrences": { + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, + }, + "occ_count": { + "sidewalks": 0, + "crossings": 0, + "kerbs": 0, + }, }, - # "missing_value": { # 'about':"tags with only a key, no value", # 'dict' : None, # 'type':'tags', - # 'occurrences': { # 'sidewalks': {}, # 'crossings': {}, # 'kerbs': {}, # }, - - - # 'occ_count': { # 'sidewalks': 0, # 'crossings': 0, # 'kerbs': 0, # }, - # }, - - - } occurrence_per_feature = { - 'sidewalks': {}, - 'crossings': {}, - 'kerbs': {}, + "sidewalks": {}, + "crossings": {}, + "kerbs": {}, } - - - diff --git a/data_quality/tag_values_checking.py b/data_quality/tag_values_checking.py index 4713a0d..4133e34 100644 --- a/data_quality/tag_values_checking.py +++ b/data_quality/tag_values_checking.py @@ -1,31 +1,21 @@ import sys -sys.path.append('oswm_codebase') -from functions import * -# from constants import * - -sidewalks_gdf = gpd.read_parquet(sidewalks_path_raw) -crossings_gdf = gpd.read_parquet(crossings_path_raw) -kerbs_gdf = gpd.read_parquet(kerbs_path_raw) -gdf_dict = {'sidewalks':sidewalks_gdf,'crossings':crossings_gdf,'kerbs':kerbs_gdf} - -sidewalks_columns = print_relevant_columnamesV2(sidewalks_gdf) -record_to_json('sidewalks',sidewalks_columns,feat_keys_path) -crossings_columns = print_relevant_columnamesV2(crossings_gdf) -record_to_json('crossings',crossings_columns,feat_keys_path) -kerbs_columns = print_relevant_columnamesV2(kerbs_gdf) -record_to_json('kerbs',kerbs_columns,feat_keys_path) +sys.path.append("oswm_codebase") +from functions import * -columns_dict = {'sidewalks':sidewalks_columns,'crossings':crossings_columns,'kerbs':kerbs_columns} +gdf_dict = get_gdfs_dict(raw_data=True) +columns_dict = read_json(feat_keys_path) unique_values_dict = {} for category in columns_dict: unique_values_dict[category] = {} for osmkey in columns_dict[category]: - unique_values_dict[category][osmkey] = list(gdf_dict[category][osmkey].unique()) + unique_values_dict[category][osmkey] = list( + gdf_dict[category][osmkey].dropna().unique() + ) -dump_json(unique_values_dict,unique_values_path) +dump_json(unique_values_dict, unique_values_path) valid_tag_values = {} @@ -33,13 +23,13 @@ valid_tag_values[category] = {} for osmkey in fields_values_properties[category]: # excluding musthave keys that are real numbers, the rules must be applied in another fashion - if osmkey not in ('width','incline','incline:across'): + if osmkey not in ("width", "incline", "incline:across"): valid_tag_values[category][osmkey] = [] for valid_value in fields_values_properties[category][osmkey]: if valid_value: - if valid_value not in ('?'): + if valid_value not in ("?"): valid_tag_values[category][osmkey].append(valid_value) -dump_json(valid_tag_values,valid_values_path) \ No newline at end of file +dump_json(valid_tag_values, valid_values_path) diff --git a/filtering_adapting_data.py b/filtering_adapting_data.py index eb1442b..3231e72 100644 --- a/filtering_adapting_data.py +++ b/filtering_adapting_data.py @@ -100,6 +100,7 @@ "nodes", "element_type", "id", + "ways", ] ]