Skip to content

Commit

Permalink
fixed the classification predictions; converts tags to ids;
Browse files Browse the repository at this point in the history
  • Loading branch information
ranjan-stha committed Nov 28, 2023
1 parent 82d27c6 commit a21f360
Show file tree
Hide file tree
Showing 6 changed files with 1,300 additions and 42 deletions.
11 changes: 11 additions & 0 deletions handlers/ecs/entryextraction/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Version strings, one constant per tag category named in the identifier.
# All of them are currently "1.0.0".
# NOTE(review): presumably each one identifies the version reported for that
# category's predictions -- confirm against the code that reads these constants.
SECTOR_VERSION = "1.0.0"
SUBPILLAR_VERSION = "1.0.0"
SUBPILLARS_1D_VERSION = "1.0.0"
SUBPILLARS_2D_VERSION = "1.0.0"
AGE_VERSION = "1.0.0"
GENDER_VERSION = "1.0.0"
SPECIFIC_NEEDS_GROUP_VERSION = "1.0.0"
SEVERITY_VERSION = "1.0.0"
AFFECTED_GRP_VERSION = "1.0.0"
DEMOGRAPHIC_GROUP_VERSION = "1.0.0"
RELIABILITY_VERSION = "1.0.0"
87 changes: 45 additions & 42 deletions handlers/ecs/entryextraction/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import pandas as pd
from botocore.exceptions import ClientError

from tags import total_tags
#from tags import total_tags
from tags_to_ids import get_model_tags_mappings
from postprocess_tags import convert_current_dict_to_previous_one
from utils import get_words_count
from const import (
HIGH_LEVEL_TAG_GROUPS,
Expand All @@ -14,13 +16,12 @@
CLASSIFICATION_MODEL_VERSION
)


# Set the root logger's level to INFO.
logging.getLogger().setLevel(logging.INFO)
# Module-level "sagemaker-runtime" client, created once when this module is
# imported and pinned to the us-east-1 region.
client = boto3.session.Session().client("sagemaker-runtime", region_name="us-east-1")


def get_outputs_from_endpoint_text(document: str, endpoint_name: str):

""" Send request to sagemaker endpoint to get the tag predictions """
inputs = pd.DataFrame(document, columns=["excerpt"])
inputs["return_type"] = "default_analyis"
inputs["analyis_framework_id"] = "all"
Expand Down Expand Up @@ -127,48 +128,48 @@ def reformat_old_output(output: list):
return reformat


def get_tag_ids(total_tags, taglist, idx=0):
    """
    Retrieve the tag IDs for a hierarchical tag, one ID per level.

    Args:
        total_tags: Sequence of levels; each level is an iterable of dicts
            that carry a "key" entry and, optionally, an "id" entry.
        taglist: Sequence of keys, one per level, to look up in ``total_tags``.
        idx: Level at which the lookup starts.

    Returns:
        A list with exactly one entry for every level from ``idx`` to the last
        level of ``total_tags``. An entry is the matched tag's "id" (``None``
        when the tag has no "id"). As soon as a level has no matching key, or
        ``taglist`` has no key for it, that level and every deeper level are
        reported as ``None``.
    """
    tag_ids = []
    depth = len(total_tags)
    for level in range(idx, depth):
        match = None
        if level < len(taglist):
            # First tag of this level whose key equals the requested key.
            match = next(
                (tag for tag in total_tags[level] if tag["key"] == taglist[level]),
                None,
            )
        if match is None:
            # Pad the remaining levels so that callers unpacking one value per
            # level always receive the expected number of entries.
            tag_ids.extend([None] * (depth - level))
            break
        tag_ids.append(match.get("id"))
    return tag_ids
# def get_tag_ids(total_tags, taglist, idx=0):
# """
# Retrieves the tag IDs
# """
# for tag in total_tags[idx]:
# if tag["key"] == taglist[idx]:
# if idx >= len(total_tags) - 1:
# return [tag.get("id", None)]
# else:
# return [tag.get("id", None)] + get_tag_ids(
# total_tags, taglist, idx=idx + 1
# )
# return [None]



def convert_prediction(pred, thresholds):
# def convert_prediction(pred, thresholds):

tag_preds = {}
for label, prob in pred.items():
firstlabel, secondlabel, thirdlabel = get_tag_ids(
total_tags, label.split("->")
)
if not (firstlabel and secondlabel and thirdlabel):
continue
if firstlabel not in tag_preds:
tag_preds[firstlabel] = {}
if secondlabel not in tag_preds[firstlabel]:
tag_preds[firstlabel][secondlabel] = {}
if thirdlabel not in tag_preds[firstlabel][secondlabel]:
tag_preds[firstlabel][secondlabel][thirdlabel] = {
"prediction": prob,
"threshold": thresholds[label],
"is_selected": prob > thresholds[label],
}
# tag_preds = {}
# for label, prob in pred.items():
# firstlabel, secondlabel, thirdlabel = get_tag_ids(
# total_tags, label.split("->")
# )
# if not (firstlabel and secondlabel and thirdlabel):
# continue
# if firstlabel not in tag_preds:
# tag_preds[firstlabel] = {}
# if secondlabel not in tag_preds[firstlabel]:
# tag_preds[firstlabel][secondlabel] = {}
# if thirdlabel not in tag_preds[firstlabel][secondlabel]:
# tag_preds[firstlabel][secondlabel][thirdlabel] = {
# "prediction": prob,
# "threshold": thresholds[label],
# "is_selected": prob > thresholds[label],
# }

return tag_preds
# return tag_preds



def create_final_output(output: dict, classification_results: dict):

""" Generate the final output """
blocks = output["blocks"]
true_indexes = classification_results["predictions"]==1
selected = np.array(classification_results["indexes"])[true_indexes]
Expand All @@ -178,16 +179,18 @@ def create_final_output(output: dict, classification_results: dict):
if block.get("type") == "text":
block.update({
"relevant": False,
"classification": None
"prediction_status": False,
"classification": {}
})

for i, j in zip(selected, pred_vector):

block = blocks[i]
pred = convert_prediction(classification_results["raw_predictions"][j],
classification_results["thresholds"])
tags_pred = convert_current_dict_to_previous_one(classification_results["raw_predictions"][j])
tags_threshold = convert_current_dict_to_previous_one(classification_results["thresholds"])
pred = get_model_tags_mappings(tags_pred, tags_threshold)
block.update({
"relevant": True,
"prediction_status": True,
"classification": pred
})

Expand Down Expand Up @@ -219,7 +222,7 @@ def __init__(
std_multiplier: float=None,
length_weight: float=None,
):

self.model_endpoint = model_endpoint
self.selected_tags = selected_tags
self.mean_percentage = mean_percentage
Expand Down Expand Up @@ -279,7 +282,7 @@ def predict(self, document):
thres = thres+self.length_weight*np.log(len(document))
prediction[np.where(means>=thres)] = 1
results.update({"predictions": prediction})

return create_final_output(
output=document,
classification_results=results
Expand Down
173 changes: 173 additions & 0 deletions handlers/ecs/entryextraction/postprocess_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
from typing import List, Dict
from collections import defaultdict

# Second-level names that route a "subpillars..." tag into the "subpillars_1d"
# bucket in convert_current_dict_to_previous_one. Two capitalizations of
# "Information and communication" are listed.
pillars_1d_tags = [
"Covid-19",
"Casualties",
"Context",
"Displacement",
"Humanitarian access",
"Shock/event",
"Information And Communication",
"Information and communication",
]

# Second-level names accepted for the "subpillars_2d" bucket. Two
# capitalizations are listed for "Priority interventions" and "Priority needs".
pillars_2d_tags = [
"At risk",
"Priority Interventions",
"Capacities & response",
"Humanitarian conditions",
"Impact",
"Priority Needs",
"Priority interventions",
"Priority needs",
]

# Second-level names accepted under a "secondary_tags->..." tag.
secondary_tags = [
"Age",
"Gender",
"affected_groups",
"specific_needs_groups",
"severity",
"Displaced", ##
"Non displaced"
]


def get_preds_entry(
    preds_column: Dict[str, float],
    return_at_least_one=True,
    ratio_nb=1,
    return_only_one=False,
):
    """Select the tags of one entry from their ratio values.

    Args:
        preds_column: Mapping of tag name to ratio.
        return_at_least_one: When no ratio is strictly above ``ratio_nb``,
            fall back to the tag(s) holding the highest ratio.
        ratio_nb: A tag is selected when its ratio is strictly greater than
            this value.
        return_only_one: Ignore ``ratio_nb`` and return only the tag(s) holding
            the highest ratio. Tags tied for the maximum are all returned.

    Returns:
        List of selected tag names, in the mapping's iteration order. Empty
        when ``preds_column`` is empty.
    """
    if not return_only_one:
        preds_entry = [
            sub_tag for sub_tag, ratio in preds_column.items() if ratio > ratio_nb
        ]
        if preds_entry or not return_at_least_one:
            return preds_entry

    # Highest-ratio selection. An empty mapping yields no tags; max() would
    # raise on it, so it is handled explicitly.
    if not preds_column:
        return []
    # Computed once instead of once per item.
    highest_ratio = max(preds_column.values())
    return [
        sub_tag for sub_tag, ratio in preds_column.items() if ratio == highest_ratio
    ]


def get_predictions_all(
    ratios_entries: List[Dict[str, float]],
    pillars_2d=pillars_2d_tags,
    pillars_1d=pillars_1d_tags,
    ratio_nb: int = 1,
):
    """Turn per-entry ratios into per-category lists of predicted tags.

    Args:
        ratios_entries: One dict per entry. Each dict holds "primary_tags"
            (with "sectors", "subpillars_2d" and "subpillars_1d" mappings of
            tag -> ratio) and "secondary_tags" (with "Age", "Gender",
            "affected_groups", "specific_needs_groups", "Displaced",
            "Non displaced" and "severity" mappings of tag -> ratio).
        pillars_2d: Not used by this function; kept for interface
            compatibility.
        pillars_1d: Not used by this function; kept for interface
            compatibility.
        ratio_nb: Threshold forwarded to ``get_preds_entry``.

    Returns:
        A ``defaultdict(list)`` mapping each category name to a list that holds
        one list of predicted tags per entry, in input order.
    """
    secondary_tag_names = (
        "Age",
        "Gender",
        "affected_groups",
        "specific_needs_groups",
        "Displaced",
        "Non displaced",
    )

    predictions = defaultdict(list)
    for ratios_one_entry in ratios_entries:
        primary_ratios = ratios_one_entry["primary_tags"]
        sector_ratios = primary_ratios["sectors"]
        subpillars_2d_ratios = primary_ratios["subpillars_2d"]
        subpillars_1d_ratios = primary_ratios["subpillars_1d"]

        # Sectors and 2D subpillars are predicted together: both get at least
        # one tag when any of their ratios reaches the threshold (>=), and both
        # stay empty otherwise.
        reaches_threshold = any(
            ratio >= ratio_nb for ratio in sector_ratios.values()
        ) or any(ratio >= ratio_nb for ratio in subpillars_2d_ratios.values())
        if reaches_threshold:
            preds_2d = get_preds_entry(subpillars_2d_ratios, True, ratio_nb)
            preds_sectors = get_preds_entry(sector_ratios, True, ratio_nb)
        else:
            preds_2d = []
            preds_sectors = []

        predictions["sectors"].append(preds_sectors)
        predictions["subpillars_2d"].append(preds_2d)
        predictions["subpillars_1d"].append(
            get_preds_entry(subpillars_1d_ratios, False, ratio_nb)
        )

        secondary_ratios = ratios_one_entry["secondary_tags"]
        for secondary_tag in secondary_tag_names:
            predictions[secondary_tag].append(
                get_preds_entry(secondary_ratios[secondary_tag], False, ratio_nb)
            )

        severity_ratios = secondary_ratios["severity"]
        # Severity is only predicted alongside a "Humanitarian conditions"
        # subpillar. The comparison ignores case because the pillar name is
        # spelled "Humanitarian conditions" in pillars_2d_tags, which the
        # former case-sensitive "Humanitarian Conditions" check never matched.
        if any("humanitarian conditions" in item.lower() for item in preds_2d):
            preds_severity = get_preds_entry(severity_ratios, True, ratio_nb, True)
        else:
            preds_severity = []
        predictions["severity"].append(preds_severity)

    return predictions


def flatten(t: List[List]) -> List:
    """Concatenate the items of every sub-list of ``t`` into one flat list."""
    merged = []
    for sublist in t:
        merged.extend(sublist)
    return merged


def convert_current_dict_to_previous_one(
    ratios_one_entry: Dict[str, float]
) -> Dict[str, Dict[str, float]]:
    """Regroup a flat ``"level->level->..."`` tag mapping into nested buckets.

    Each key of ``ratios_one_entry`` is split on ``"->"`` and routed by its
    first level:

    * first level starting with ``"subpillars"``: stored under
      ``"subpillars_1d"`` or ``"subpillars_2d"`` (decided by the second level),
      keyed by the remaining levels joined with ``"->"``;
    * first level ``"secondary_tags"``: stored under the second level, keyed by
      the third level;
    * anything else: stored under ``"sectors"`` keyed by the third level when
      the second level is ``"sectors"``, ignored otherwise.

    Args:
        ratios_one_entry: Mapping of flat tag name to a number.

    Returns:
        ``{"primary_tags": {...}, "secondary_tags": {...}}`` where every bucket
        is present even when empty.

    Raises:
        AssertionError: If a subpillar's second level is in neither pillar
            list, or a secondary tag's second level is not a known secondary
            tag.
    """
    # prim tags
    primary_tags_results = {"sectors": {}, "subpillars_2d": {}, "subpillars_1d": {}}

    # sec tags
    secondary_tags_results = {
        "Age": {},
        "Gender": {},
        "affected_groups": {},
        "specific_needs_groups": {},
        "severity": {},
        "Displaced": {},
        "Non displaced": {},
    }

    for tag, number in ratios_one_entry.items():
        tag_levels = tag.split("->")
        first_level = tag_levels[0]

        if first_level.startswith("subpillars"):
            pillar = tag_levels[1]
            if pillar in pillars_1d_tags:
                subpillar_name = "subpillars_1d"
            elif pillar in pillars_2d_tags:
                subpillar_name = "subpillars_2d"
            else:
                # Raised explicitly (not via `assert`) so the validation still
                # runs under `python -O`; the exception type is unchanged.
                raise AssertionError(f"unknown pillar {pillar!r} in tag {tag!r}")

            primary_tags_results[subpillar_name]["->".join(tag_levels[1:])] = number

        elif first_level == "secondary_tags":
            if tag_levels[1] not in secondary_tags:
                # Same rationale as above: keep the check active under -O.
                raise AssertionError(
                    f"unknown secondary tag {tag_levels[1]!r} in tag {tag!r}"
                )

            secondary_tags_results[tag_levels[1]][tag_levels[2]] = number

        elif tag_levels[1] == "sectors":
            primary_tags_results["sectors"][tag_levels[2]] = number

    return {
        "primary_tags": primary_tags_results,
        "secondary_tags": secondary_tags_results,
    }
Loading

0 comments on commit a21f360

Please sign in to comment.