From e65a4775667c409706e391a381644551a77033cd Mon Sep 17 00:00:00 2001 From: "longbin.lailb" Date: Thu, 9 Jan 2025 20:09:11 +0800 Subject: [PATCH] fix fetch paper new --- .../apps/paper_reading/paper_reading_nodes.py | 37 +++++++++++++------ python/graphy/config/workflow.json | 14 +++---- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/python/graphy/apps/paper_reading/paper_reading_nodes.py b/python/graphy/apps/paper_reading/paper_reading_nodes.py index 00ad756c..a7ee09b3 100644 --- a/python/graphy/apps/paper_reading/paper_reading_nodes.py +++ b/python/graphy/apps/paper_reading/paper_reading_nodes.py @@ -390,7 +390,7 @@ def from_dict( nodes_dict = {} nodes = [] edges = [] - start_node = "Paper" + start_node = "PaperNew" if "nodes" in graph_dict: for node in graph_dict["nodes"]: @@ -489,12 +496,19 @@ def run_through( data_id, current_node.name ) if persist_results: - logger.info(f"Found persisted data for node '{current_node.name}'") + logger.warning( + f"Found persisted data for node '{current_node.name}'" + ) # execute pdf extraction anyway if current_node.name == first_node.name: - next(current_node.execute(state)) - last_output = persist_results - is_persist = False + logger.warning( + f"Re-executing first node '{current_node.name}' and overwriting its persisted results..." + ) + last_output = next(current_node.execute(state)) + is_persist = True + else: + last_output = persist_results + is_persist = False else: # Execute the current node output_generator = current_node.execute(state) @@ -596,6 +603,8 @@ def execute( DataGenerator: The output data generator from the node. 
""" + logger.warning("================= START INSPECT ==============") + for input_data in input: logger.error(f"input data: {input_data}") paper_file_path = input_data.get("paper_file_path", None) @@ -607,8 +616,8 @@ def execute( ) if not paper_file_path: - logger.error("No 'paper_file_path' provided in input data.") - logger.error(f"create fake extractor {paper_meta_path}") + logger.warning("No 'paper_file_path' provided in input data.") + logger.warning(f"create fake extractor {paper_meta_path}") if not paper_meta_path: continue try: @@ -629,11 +638,16 @@ def execute( data_id = process_id(base_name) pdf_extractor.set_img_path(f"{WF_IMAGE_DIR}/{data_id}") - first_node_name = self.graph.get_first_node_name() + first_node = self.graph.get_first_node() + # Guard: only dereference .name after confirming a first node exists. + if first_node: + first_node_name = first_node.name + else: + raise ValueError("No first node found in the graph.") - all_nodes = self.graph.get_node_names() - persist_states = self.persist_store.get_states(data_id, all_nodes) - if len(all_nodes) == len(persist_states): + all_nodes = set(self.graph.get_node_names()) + persist_states = set(self.persist_store.get_total_states(data_id)) + if persist_states == all_nodes: # This means that the data has already processed logger.info(f"Input with ID '{data_id}' already processed.") self.progress["total"].add( @@ -692,6 +706,7 @@ def execute( except Exception as e: logger.error(f"Error processing the paper: {e}") + traceback.print_exc() # clean state if data_id and data_id in state: state[data_id][WF_STATE_CACHE_KEY].clear() diff --git a/python/graphy/config/workflow.json b/python/graphy/config/workflow.json index 32b43d65..749c9207 100644 --- a/python/graphy/config/workflow.json +++ b/python/graphy/config/workflow.json @@ -7,7 +7,7 @@ "graph": { "nodes": [ { - "name": "Paper" + "name": "PaperNew" }, { "name": "Topic", @@ -185,15 +185,15 @@ ], "edges": [ { - "source": "Paper", + "source": "PaperNew", "target": "Topic" }, { - "source": 
"Paper", + "source": "PaperNew", "target": "Background" }, { - "source": "Paper", + "source": "PaperNew", "target": "Contribution" }, { @@ -205,15 +205,15 @@ "target": "Solution" }, { - "source": "Paper", + "source": "PaperNew", "target": "Experiment" }, { - "source": "Paper", + "source": "PaperNew", "target": "Baseline" }, { - "source": "Paper", + "source": "PaperNew", "target": "Github" } ]