From ba884539d7946224b178cf579a3fa6e21f30fcc8 Mon Sep 17 00:00:00 2001
From: Traun Leyden
Date: Mon, 4 Dec 2023 14:22:56 +0100
Subject: [PATCH] Fixes #78 Reading comprehension synthetic data regex improvements

---
 .../reading_comprehension_generation/utils.py | 168 +++++++++++----
 .../test_utils.py                             | 197 ++++++++++++++++++
 2 files changed, 319 insertions(+), 46 deletions(-)
 create mode 100644 tests/datasets/reading_comprehension_generation/test_utils.py

diff --git a/dalm/datasets/reading_comprehension_generation/utils.py b/dalm/datasets/reading_comprehension_generation/utils.py
index af019d7..576c1c0 100644
--- a/dalm/datasets/reading_comprehension_generation/utils.py
+++ b/dalm/datasets/reading_comprehension_generation/utils.py
@@ -140,38 +140,59 @@ def create_domain_tokenizer_from_files(directory_or_file: str, csv_column: Optio
         return create_domain_tokenizer(os.path.join(temp_dir, "temp.txt"))
 
 
-def fix_first_prompt(text: str, chat_chain: List[Dict[str, str]]) -> List[Dict[str, str]]:
-    # remove the first prompt
-    first_prompt = chat_chain.pop(0)
-    fixed_first_prompt = [
-        {
-            "content": f"Based on the following text: \n {text}, \n I'd like you to answer a few questions\n"
-            + first_prompt["content"],
-            "role": "user",
-        }
-    ]
-    return fixed_first_prompt + chat_chain
+def wrap_context_with_rag_instruction(context: str) -> str:
+    return f"Based on the following text: \n {context}, \n I'd like you to answer a few questions\n"
 
 
-# TODO: add test
-# TODO: Address known issues described in #78
-def question_and_answer_extractor(whole_text: str, context: str) -> List[Dict[str, str]] | None:
-    text_lines = whole_text.split("\n")
-    question: List[str] = []
-    answer: List[str] = []
+def extract_question(text: str) -> Tuple[bool, str]:
+    """
+    Extracts a question from a line of text.
+    Returns a tuple of (is_question, question_text)
+    """
+    # Delegate to the shared question/answer extraction helper
+    return extract_question_or_answer(text, extract_type="question")
+
+def extract_answer(text: str) -> Tuple[bool, str]:
+    """
+    Extracts an answer from a line of text.
+    Returns a tuple of (is_answer, answer_text)
+    """
+    # Delegate to the shared question/answer extraction helper
+    return extract_question_or_answer(text, extract_type="answer")
 
-    question_context = False
-    answer_context = False
+def extract_question_or_answer(text: str, extract_type: str = "question") -> Tuple[bool, str]:
+
+    # Match a line that starts with any number of junk characters, followed by either "question:"
+    # or "answer:", followed by any number of spaces (ignored), followed by any number of characters
+    # that will be captured in a group as the question or answer:
+    # extraction_regex = rf".*{extract_type}:\s*(.*)"
+
+    # Updated version of the regex above: it also handles the case where the question or answer
+    # label is in brackets, with other text inside the brackets to be ignored
+    extraction_regex = rf".*\[?{extract_type}[:\]]*(?:.*?\])?\s*(.*)"
+
+    match = re.match(extraction_regex, text, re.IGNORECASE)
+    extracted_text = match.group(1) if match else ""
+    found_extracted = bool(extracted_text)
+    return found_extracted, extracted_text
+
+
+def _raw_question_and_answer_extractor(whole_text: str) -> List[Dict[str, str]]:
+    """
+    Loop over all lines in the text.
+    When we find a question, capture the question into a variable and set a state flag.
+    When we find an answer, capture the answer into a variable and save the QA pair.
+    When we run out of lines, return the list of QA pairs.
+    """
 
-    result = []
     task_regex = r"^\*?\*?task\s*\d*"
 
-    # question regex
-    question_regex = r"^question\s*\d*"
+    cur_qa_pair = {}
+    qa_pairs = []
 
-    # answer regex
-    answer_regex = r"^answer\s*\d*"
+    state = "waiting_for_question"  # waiting_for_question, waiting_for_answer
 
+    text_lines = whole_text.split("\n")
     for i in text_lines:
         raw_text = i.strip()
         text = raw_text.lower()
@@ -180,31 +201,86 @@ def question_and_answer_extractor(whole_text: str, context: str) -> List[Dict[st
         if text == "":
             continue
 
-        # if the line start matches the question regex or the task regex
-        if re.match(question_regex, text) or re.match(task_regex, text):
-            if answer_context:
-                result.append({"content": " ".join(question), "role": "user"})
-                result.append({"content": " ".join(answer), "role": "assistant"})
-                question = []
-                answer = []
-                answer_context = False
+        # If the line matches the task regex, log a warning. The old code handled
+        # "tasks", but this new code does not; we still need to inspect where these come into play.
+        if re.match(task_regex, text):
+            logger.warning(f"Found a task line: {text}")
+
+        if state == "waiting_for_question":
+            is_question, question_text = extract_question(text)
+            if is_question:
+                state = "waiting_for_answer"
+                cur_qa_pair = {"question": question_text, "answer": "TBD"}
+            continue
+        elif state == "waiting_for_answer":
+            is_answer, answer_text = extract_answer(text)
+            if is_answer:
+                state = "waiting_for_question"
+                cur_qa_pair["answer"] = answer_text
+                qa_pairs.append(cur_qa_pair)
+            continue
+        else:
+            raise ValueError("Unknown state")
+
+    return qa_pairs
+
+
+def convert_qa_pairs_to_chat_completions(qa_pairs: List[Dict[str, str]]) -> List[Dict[str, str]]:
+    """
+    Convert a list of QA pairs into a list of chat completions that can be fed into the large language model.
+    """
+    chat_completions = []
+    for qa_pair in qa_pairs:
+        question = qa_pair["question"]
+        answer = qa_pair["answer"]
+
+        question_chat_completion = {
+            "content": question,
+            "role": "user",
+        }
+
+        answer_chat_completion = {
+            "content": answer,
+            "role": "assistant",
+        }
+
+        chat_completions.append(question_chat_completion)
+        chat_completions.append(answer_chat_completion)
+
+    return chat_completions
+
+def question_and_answer_extractor(whole_text: str, context: str) -> List[Dict[str, str]] | None:
+    """
+    Extracts questions and answers from the raw text generated by the large language model.
+
+    @param whole_text: the raw questions and answers generated by the large language model, eg:
+       "1. QUESTION: Can you summarize the .. ?
+        ANSWER: Population imaging studies generated .."
+    @param context: the full dataset text that was used to generate the questions and answers, eg:
+        "Population imaging studies generate data for developing and implementing..."
+    """
+
+    chat_completion_inputs = []
 
-            question_context = True
-            answer_context = False
+    # Wrap the context with a RAG instruction
+    context_instruction = wrap_context_with_rag_instruction(context)
 
-        if re.match(answer_regex, text):
-            question_context = False
-            answer_context = True
+    # The first chat completion input is the context instruction
+    first_chat_completion_input = {
+        "content": context_instruction,
+        "role": "user",
+    }
+    chat_completion_inputs.append(first_chat_completion_input)
 
-        if question_context:
-            # remove (labelled as QUESTION and ANSWER) from the text
-            raw_text = re.sub(r"\(labelled as QUESTION and ANSWER\)", "", raw_text)
-            question.append(raw_text)
+    # Extract the qa pairs from whole_text
+    qa_pairs = _raw_question_and_answer_extractor(whole_text)
 
-        if answer_context:
-            answer.append(raw_text)
+    # Convert the qa pairs to chat completion inputs
+    qa_pairs_chat_completions = convert_qa_pairs_to_chat_completions(qa_pairs)
 
-    if result == []:
-        return None
+    # Add the qa pairs chat completions to the result
+    chat_completion_inputs.extend(qa_pairs_chat_completions)
 
-    return fix_first_prompt(context, result)
+    return chat_completion_inputs
\ No newline at end of file
diff --git a/tests/datasets/reading_comprehension_generation/test_utils.py b/tests/datasets/reading_comprehension_generation/test_utils.py
new file mode 100644
index 0000000..e485ada
--- /dev/null
+++ b/tests/datasets/reading_comprehension_generation/test_utils.py
@@ -0,0 +1,197 @@
+from dalm.datasets.reading_comprehension_generation.utils import _raw_question_and_answer_extractor
+from dalm.datasets.reading_comprehension_generation.utils import question_and_answer_extractor
+
+def test_question_and_answer_extractor():
+    chat_completions = question_and_answer_extractor(
+        whole_text="""
+1. QUESTION: Can you summarize the purpose of population imaging studies and how they contribute to preventing or treating disease?
+ANSWER: Population imaging studies generate data for developing and implementing personalized health strategies to prevent or more effectively treat disease. These studies acquire imaging for pre-symptomatic populations to discover alterations due to impending disease and identify individuals at risk, which enables early intervention.
+
+2. QUESTION: How does the UK Biobank study stand out in terms of size and availability of expert annotation?
+ANSWER: The UK Biobank study stands out for its sheer size, careful implementation, and availability of top quality expert annotation. The resulting massive imaging datasets targeting around 100,000 subjects have posed new challenges requiring automatic image analysis, and this study has put published approaches for cardiac image quantification to the test.
+
+3. QUESTION: What does the proposed cardiac magnetic resonance (CMR) image analysis pipeline do, and how does it differ from previous published approaches for cardiac image quantification?
+ANSWER: The proposed CMR image analysis pipeline performs end-to-end image analytics from multi-view cine CMR images all the way to anatomical and functional bi-ventricular quantification without manual user interactions. It provides fully automated extraction of global and regional reference ranges of all key functional cardiovascular indexes from both left and right cardiac ventricles for a population of 20,000 subjects imaged at 50 time frames per subject, for a total of one million CMR volumes.
This is the first published attempt to fully automate the extraction of global and regional reference ranges of all key functional cardiovascular indexes for such a large population. + +4. QUESTION: How does the proposed CMR analytics pipeline compare in terms of segmentation accuracy with respect to human experts, and what are the results of its validation against manual expert readings on a reference cohort of 4620 subjects? +ANSWER: The proposed pipeline shows broad significant agreement between the manually obtained reference indexes and those automatically computed via the framework. Around 80.67% of subjects were processed with mean contour distance of less than 1 pixel, and around 17.50% with mean contour distance between 1 and 2 pixels. The comparison with a recently published approach reporting on UKB data, which is based on deep learning, shows similar performance in terms of segmentation accuracy with respect to human experts. + """, + context=""" +Population imaging studies generate data for developing and implementing personalised health strategies to prevent, or more effectively treat disease. Large prospective epidemiological studies acquire imaging for pre-symptomatic populations. These studies enable the early discovery of alterations due to impending disease, and enable early identification of individuals at risk. Such studies pose new challenges requiring automatic image analysis. To date, few large-scale population-level cardiac imaging studies have been conducted. One such study stands out for its sheer size, careful implementation, and availability of top quality expert annotation; the UK Biobank (UKB). The resulting massive imaging datasets (targeting ca. 100,000 subjects) has put published approaches for cardiac image quantification to the test. In this paper, we present and evaluate a cardiac magnetic resonance (CMR) image analysis pipeline that properly scales up and can provide a fully automatic analysis of the UKB CMR study. Without manual user interactions, our pipeline performs end-to-end image analytics from multi-view cine CMR images all the way to anatomical and functional bi-ventricular quantification. All this, while maintaining relevant quality controls of the CMR input images, and resulting image segmentations. To the best of our knowledge, this is the first published attempt to fully automate the extraction of global and regional reference ranges of all key functional cardiovascular indexes, from both left and right cardiac ventricles, for a population of 20,000 subjects imaged at 50 time frames per subject, for a total of one million CMR volumes. In addition, our pipeline provides 3D anatomical bi-ventricular models of the heart. These models enable the extraction of detailed information of the morphodynamics of the two ventricles for subsequent association to genetic, omics, lifestyle habits, exposure information, and other information provided in population imaging studies. We validated our proposed CMR analytics pipeline against manual expert readings on a reference cohort of 4620 subjects with contour delineations and corresponding clinical indexes. Our results show broad significant agreement between the manually obtained reference indexes, and those automatically computed via our framework. 80.67% of subjects were processed with mean contour distance of less than 1 pixel, and 17.50% with mean contour distance between 1 and 2 pixels. 
Finally, we compare our pipeline with a recently published approach reporting on UKB data, and based on deep learning. Our comparison shows similar performance in terms of segmentation accuracy with respect to human experts. + """, + ) + print(chat_completions) + + # The first chat completion item should be a user prompt, and it should start with "Based on the following text:" + assert chat_completions[0]["content"].startswith("Based on the following text:") + assert chat_completions[0]["role"] == "user" + + # We should have 9 chat completion items for this input + assert len(chat_completions) == 9 + + # Now it should alternate between user and assistant roles. + # Odd numbers should be user prompts, and even numbers should be assistant responses. + for i, chat_completion in enumerate(chat_completions): + + # Skip the first row, which was already verified as having a user role above. + if i == 0: + continue + + if i % 2 == 0: + assert chat_completion["role"] == "assistant" + else: + assert chat_completion["role"] == "user" + + + + + +def test_raw_question_and_answer_extractor(): + + inputs = [ + { + "whole_text": """ + QUESTION: What is the focus? + ANSWER: The focus is health strategies. + + QUESTION: What is unique about the UK? + ANSWER: The UK Biobank (UKB). + + QUESTION: What is the focus of the proposed? + ANSWER: The focus of the proposed CMR image imaging studies. + + QUESTION: How was the proposed CMR analytics? + ANSWER: The proposed CMR analytics pipeline was validated.""", + "expected_output": [ + { + "question": "What is the focus?", + "answer": "The focus is health strategies." + }, + { + "question": "What is unique about the UK?", + "answer": "The UK Biobank (UKB)." + }, + { + "question": "What is the focus of the proposed?", + "answer": "The focus of the proposed CMR image imaging studies." + }, + { + "question": "How was the proposed CMR analytics?", + "answer": "The proposed CMR analytics pipeline was validated." + } + ] + }, + { + "whole_text": """1. QUESTION: What are thoracic diseases? + ANSWER: Thoracic diseases refer to health problems. + + 2. QUESTION: How is chest X-ray currently? + ANSWER: Chest X-ray is currently one. + + 3. QUESTION: Why is reading chest X-ray images? + ANSWER: Reading chest X-ray images. + + 4. QUESTION: What is the proposed solution? + ANSWER: To make a deep architecture.""", + "expected_output": [ + { + "question": "What are thoracic diseases?", + "answer": "Thoracic diseases refer to health problems." + }, + { + "question": "How is chest X-ray currently?", + "answer": "Chest X-ray is currently one." + }, + { + "question": "Why is reading chest X-ray images?", + "answer": "Reading chest X-ray images." + }, + { + "question": "What is the proposed solution?", + "answer": "To make a deep architecture." + } + ] + }, + { + "whole_text": """1. [QUESTION:] What are thoracic diseases? + [ANSWER:] Thoracic diseases refer to health problems. + + 2. [QUESTION:] How is chest X-ray currently? + [ANSWER:] Chest X-ray is currently one. + + 3. [QUESTION:] Why is reading chest X-ray images? + [ANSWER:] Reading chest X-ray images . + + 4. [QUESTION:] What is the proposed solution? + [ANSWER:] To make a deep architecture.""", + "expected_output": [ + { + "question": "What are thoracic diseases?", + "answer": "Thoracic diseases refer to health problems." + }, + { + "question": "How is chest X-ray currently?", + "answer": "Chest X-ray is currently one." + }, + { + "question": "Why is reading chest X-ray images?", + "answer": "Reading chest X-ray images ." 
+ }, + { + "question": "What is the proposed solution?", + "answer": "To make a deep architecture." + } + ] + }, + { + "whole_text": """ 1. [QUESTION: Complete-the-sentence Q&A] What are thoracic diseases? + ANSWER: Thoracic diseases refer to health problems. + + 2. [QUESTION: True/false Q&A] How is chest X-ray currently? + ANSWER: Chest X-ray is currently one.""", + "expected_output": [ + { + "question": "What are thoracic diseases?", + "answer": "Thoracic diseases refer to health problems." + }, + { + "question": "How is chest X-ray currently?", + "answer": "Chest X-ray is currently one." + } + ] + }, + { + "whole_text": """1. Question (type: normal q&a): What are thoracic diseases? + Answer: Thoracic diseases refer to health problems. + + 2. Question (type: complete-the-sentence): How is chest X-ray currently? + Answer: Chest X-ray is currently one. + """, + "expected_output": [ + { + "question": "(type: normal q&a): What are thoracic diseases?", + "answer": "Thoracic diseases refer to health problems." + }, + { + "question": "(type: complete-the-sentence): How is chest X-ray currently?", + "answer": "Chest X-ray is currently one." + } + ] + }, + + + ] + + for input in inputs: + result_qa_pairs = _raw_question_and_answer_extractor(whole_text=input["whole_text"]) + expected_qa_pairs = input["expected_output"] + for result, expected in zip(result_qa_pairs, expected_qa_pairs): + result_question = result["question"].strip().lower() + expected_question = expected["question"].strip().lower() + result_answer = result["answer"].strip().lower() + expected_answer = expected["answer"].strip().lower() + assert result_question == expected_question, f"result_question: {result_question} != expected_question: {expected_question}" + assert result_answer == expected_answer, f"result_answer: {result_answer} != expected_answer: {expected_answer}" + + +
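+
+def test_raw_question_and_answer_extractor_no_matches():
+    # Minimal sketch of a negative case: none of the lines mention "question" or "answer",
+    # so the extractor's state machine never records a pair and returns an empty list.
+    whole_text = "Some unrelated line.\nAnother unrelated line."
+    result_qa_pairs = _raw_question_and_answer_extractor(whole_text=whole_text)
+    assert result_qa_pairs == []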