diff --git a/Dockerfile b/Dockerfile index ae59afa..b028ece 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,7 @@ FROM python:3.8.2 RUN pip install --upgrade pip RUN pip3 install tqdm RUN pip3 install Cython +RUN pip3 install xaif_eval COPY . /app WORKDIR /app diff --git a/main.py b/main.py index 7be3644..cb5d059 100644 --- a/main.py +++ b/main.py @@ -21,9 +21,9 @@ @handle_errors def turninator_defult(): if request.method == 'POST': - file_obj = request.files.get('file') - turninator = Turninator() - result=turninator.turninator_default(file_obj) + file_obj = request.files.get('file') + turninator = Turninator(file_obj) + result=turninator.turninator_default() return result if request.method == 'GET': diff --git a/src/data.py b/src/data.py index a8ab959..414e891 100644 --- a/src/data.py +++ b/src/data.py @@ -1,43 +1,7 @@ import json from typing import Dict, List -class Data: - def __init__(self, file_obj): - self.file_obj = file_obj - self.f_name = file_obj.filename - self.file_obj.save(self.f_name) - - def is_valid_json(self): - ''' check if the file is valid json - ''' - - try: - json.loads(open(self.f_name).read()) - except ValueError as e: - return False - - return True - def is_valid_json_aif(sel,aif_nodes): - if 'nodes' in aif_nodes and 'locutions' in aif_nodes and 'edges' in aif_nodes: - return True - return False - - def get_aif(self): - if self.is_valid_json(format='xAIF'): - with open(self.f_name) as file: - data = file.read() - x_aif = json.loads(data) - if format == "xAIF": - return x_aif - else: - aif = x_aif.get('AIF') - return json.dumps(aif) - else: - return "Invalid json" - - def get_file_path(self,): - return self.f_name class AIF: def __init__(self, ): @@ -228,7 +192,7 @@ def remove_entries(self, l_node_id, nodes, edges, locutions): return edited_nodes, edited_edges, edited_locutions - def get_xAIF_arrays(self, aif_section: dict, xaif_elements: List) -> tuple: + def get_xAIF_arrays(self, aif_section, xaif_elements) -> tuple: """ Extracts values associated with specified keys from the given AIF section dictionary. diff --git a/src/templates.py b/src/templates.py index 3de11fc..cc2a6a8 100644 --- a/src/templates.py +++ b/src/templates.py @@ -9,8 +9,8 @@ def format_output(nodes, edges, locutions, schemefulfillments, descriptorfulfill aif['schemefulfillments'] = schemefulfillments aif['descriptorfulfillments'] = descriptorfulfillments aif['participants'] = participants - x_aif['AIF'] = aif - x_aif['OVA'] = OVA + x_aif['aif'] = aif + x_aif['ova'] = OVA x_aif['dialog'] = dialog x_aif['text'] = {'txt': text_with_span} return json.dumps(x_aif) diff --git a/src/turninator.py b/src/turninator.py index ee0286c..589bb18 100644 --- a/src/turninator.py +++ b/src/turninator.py @@ -1,15 +1,24 @@ import re from flask import json import logging +from xaif_eval import xaif logging.basicConfig(datefmt='%H:%M:%S', level=logging.DEBUG) -from src.data import AIF, Data +from src.data import AIF from src.templates import TurninatorOutput + + + + + class Turninator(): - def __init__(self) -> None: - pass + def __init__(self,file_obj): + self.file_obj = file_obj + self.f_name = file_obj.filename + self.file_obj.save(self.f_name) + file = open(self.f_name,'r') def dialog_turns(self, text: str) -> str: '''Extract dialog turns from input text using regex.''' @@ -19,19 +28,51 @@ def dialog_turns(self, text: str) -> str: def monolog_text(self, text: str) -> str: '''Extract the entire text if monolog.''' return re.sub('<.*?>', '', text, flags=re.DOTALL) + + def is_valid_json(self): + ''' check if the file is valid json + ''' + + try: + json.loads(open(self.f_name).read()) + except ValueError as e: + return False + + return True + def is_valid_json_aif(self,aif_nodes): + if 'nodes' in aif_nodes and 'locutions' in aif_nodes and 'edges' in aif_nodes: + return True + return False + ### + + def get_aif(self, format='xAIF'): - def turninator_default(self, path_obj): + with open(self.f_name) as file: + data = file.read() + x_aif = json.loads(data) + if format == "xAIF": + return x_aif + else: + aif = x_aif.get('aif') + return json.dumps(aif) + + def turninator_default(self,): # Get the file path from the path object - data = Data(path_obj) - path = data.get_file_path() + + AIF_obj = AIF() + extended_json_aif = {} - if path.endswith("json"): - nodes, edges, locutions = [], [], [] - # Check if the file is a valid JSON file - if data.is_valid_json(): - extended_json_aif = data.get_aif() - if 'AIF' in extended_json_aif and 'text' in extended_json_aif: - json_dict = extended_json_aif['AIF'] + if self.f_name.endswith("json"): + + xAIF_input = self.get_aif() + logging.info(f"xAIF data: {xAIF_input}, {self.file_obj}") + xaif_obj = xaif.AIF(xAIF_input) + is_json_file = self.is_valid_json() + if is_json_file: + nodes, edges, locutions = [], [], [] + extended_json_aif = xaif_obj.xaif + if 'aif' in extended_json_aif and 'text' in extended_json_aif: + json_dict = extended_json_aif['aif'] dialog = extended_json_aif.get('dialog', False) OVA = extended_json_aif.get('OVA', []) # Handle the case where 'json_dict' is a string @@ -43,7 +84,8 @@ def turninator_default(self, path_obj): if not isinstance(json_dict, dict): json_dict = json.loads(json_dict) # Extract values associated with specific keys from the AIF section - schemefulfillments, descriptorfulfillments = AIF.get_xAIF_arrays(['schemefulfillments', 'descriptorfulfillments']) + schemefulfillments, descriptorfulfillments = AIF_obj.get_xAIF_arrays(aif_section=json_dict, + xaif_elements=['schemefulfillments', 'descriptorfulfillments']) participants = json_dict.get("participants", []) if isinstance(extended_json_aif['text'], dict): text = extended_json_aif['text']['txt'] @@ -60,13 +102,13 @@ def turninator_default(self, path_obj): speakers_and_turns = self.dialog_turns(text) if is_dialog and len(self.dialog_turns(text)) else self.monolog_text(text) if is_dialog and len(self.dialog_turns(text)): speakers_and_turns = self.dialog_turns(text) - nodes, locutions, participants, text_with_span, node_id, person_id = AIF.create_turn_entry( + nodes, locutions, participants, text_with_span, node_id, person_id = AIF_obj.create_turn_entry( nodes, node_id, person_id, text_with_span, speakers_and_turns, locutions, participants, is_dialog) else: if not is_dialog: logging.info(f'processing monolog text') speakers_and_turns = self.monolog_text(text) - nodes, locutions, participants, text_with_span, node_id, person_id = AIF.create_turn_entry( + nodes, locutions, participants, text_with_span, node_id, person_id = AIF_obj.create_turn_entry( nodes, node_id, person_id, text_with_span, speakers_and_turns, locutions, participants, is_dialog) return TurninatorOutput.format_output(nodes, edges, locutions, schemefulfillments, descriptorfulfillments, participants, @@ -83,7 +125,7 @@ def turninator_default(self, path_obj): text_with_span = "" nodes, edges, schemefulfillments, descriptorfulfillments, participants, locutions = [], [], [], [], [], [] speakers_and_turns = self.monolog_text(text) - nodes, locutions, participants, text_with_span, node_id, person_id = AIF.create_turn_entry( + nodes, locutions, participants, text_with_span, node_id, person_id = AIF_obj.create_turn_entry( nodes, node_id, person_id, text_with_span, speakers_and_turns, locutions, participants, False) return TurninatorOutput.format_output(nodes, edges, locutions, schemefulfillments, descriptorfulfillments, participants, OVA, text_with_span, aif, extended_json_aif) else: @@ -92,11 +134,15 @@ def turninator_default(self, path_obj): else: # Non-json data is treated as monolog node_id, person_id = 0, 0 - data = open(path).read() + "\n" + with open(self.f_name, 'r') as file: + data = file.read() + "\n" aif, json_aif, OVA = {}, {}, {} text_with_span = "" nodes, edges, schemefulfillments, descriptorfulfillments, participants, locutions = [], [], [], [], [], [] speakers_and_turns = self.monolog_text(data) - nodes, locutions, participants, text_with_span, node_id, person_id = AIF.create_turn_entry( + nodes, locutions, participants, text_with_span, node_id, person_id = AIF_obj.create_turn_entry( nodes, node_id, person_id, text_with_span, speakers_and_turns, locutions, participants, False) return TurninatorOutput.format_output(nodes, edges, locutions, schemefulfillments, descriptorfulfillments, participants, OVA, text_with_span,aif, json_aif) + + +