diff --git a/tests/taxonomy.py b/tests/taxonomy.py index f2b6cb47..227c2534 100644 --- a/tests/taxonomy.py +++ b/tests/taxonomy.py @@ -2,11 +2,12 @@ # Standard from pathlib import Path -from typing import List +from typing import Any, Dict, List import shutil # Third Party import git +import yaml class MockTaxonomy: @@ -25,12 +26,12 @@ def untracked_files(self) -> List[str]: """List untracked files in the repository""" return self._repo.untracked_files - def create_untracked(self, rel_path: str, contents: str) -> Path: + def create_untracked(self, rel_path: str, contents: Dict[str, Any]) -> Path: """Create a new untracked file in the repository. Args: rel_path (str): Relative path (from repository root) to the file. - contents (str): String to be written to the file. + contents (Dict[str, Any]): Object to be written to the file. Returns: file_path: The path to the created file. """ @@ -38,15 +39,16 @@ def create_untracked(self, rel_path: str, contents: str) -> Path: assert not taxonomy_path.is_absolute() file_path = self.root.joinpath(taxonomy_path) file_path.parent.mkdir(exist_ok=True, parents=True) - file_path.write_text(contents, encoding="utf-8") + with file_path.open(mode="w", encoding="utf-8") as fp: + yaml.dump(contents, fp) return file_path - def add_tracked(self, rel_path, contents: str) -> Path: - """Add a new tracked file to the repository (and commits it). + def add_tracked(self, rel_path, contents: Dict[str, Any]) -> Path: + """Add a new tracked file to the repository (and commit it). Args: rel_path (str): Relative path (from repository root) to the file. - contents (str): String to be written to the file. + contents (Dict[str, Any]): Object to be written to the file. Returns: file_path: The path to the added file. 
""" diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 33d21e8f..98b87780 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -3,12 +3,462 @@ """ # Standard -from unittest import mock +from contextlib import contextmanager +from typing import Any, Dict, Union +from unittest.mock import MagicMock, patch +import glob +import json +import os +import shutil +import tempfile +import unittest + +# Third Party +from datasets import load_dataset +import pytest +import yaml # First Party -from instructlab.sdg.generate_data import _context_init +from instructlab.sdg.generate_data import _SYS_PROMPT, _context_init, generate_data +from instructlab.sdg.llmblock import LLMBlock from instructlab.sdg.pipeline import PipelineContext +TEST_TAXONOMY_BASE = "main" + +TEST_CUSTOM_YAML_RULES = b"""extends: relaxed +rules: + line-length: + max: 180 +""" + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") + +NUM_INSTRUCTIONS_TO_GENERATE = 10 + + +def validate_legacy_dataset(dataset_file_name, expected_samples): + """Test dataset in the "legacy message sample" format. + + See LegacyMessageSample in instructlab/instructlab. + + system: str + user: str + assistant: str + + This is what is currently used by the legacy training methods such as Linux training and MacOS training. + """ + ds = load_dataset("json", data_files=dataset_file_name, split="train") + features = ["system", "user", "assistant"] + assert len(ds.features) == len(features) + for feature in features: + assert feature in ds.features + assert ds.features[feature].dtype == "string" + + for idx, sample in enumerate(expected_samples): + assert ds[idx]["system"] == _SYS_PROMPT + assert ds[idx]["user"] == sample["user"] + assert ds[idx]["assistant"] == sample["assistant"] + + +def validate_messages_dataset(dataset_file_name, expected_samples): + """Test dataset in the Hugging Face messages format + + See MessageSample in instructlab/instructlab. 
+ + messages: + content: str + # one of: "user", "assistant", or "system" + role: str + """ + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert len(ds.features) == 2 + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert ds.features["messages"][0]["role"].dtype == "string" + assert ds.features["metadata"].dtype == "string" + + for idx, sample in enumerate(expected_samples): + assert len(ds[idx]["messages"]) == 2 + assert ds[idx]["messages"][0]["role"] == "user" + assert ds[idx]["messages"][0]["content"] == sample["user"] + assert ds[idx]["messages"][1]["role"] == "assistant" + assert ds[idx]["messages"][1]["content"] == sample["assistant"] + assert ds[idx]["metadata"] == json.dumps({"system": _SYS_PROMPT}) + + +def validate_skill_leaf_node_dataset(dataset_file_name): + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert len(ds.features) == 7 + features = [ + "task_description", + "seed_context", + "seed_question", + "seed_response", + "output", + "id", + ] + for feature in features: + assert feature in ds.features + assert ds.features[feature].dtype == "string" + assert "messages" in ds.features + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert ds.features["messages"][0]["role"].dtype == "string" + + +def validate_phase_leaf_node_dataset(dataset_file_name): + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert len(ds.features) == 3 + features = ["metadata", "id"] + for feature in features: + assert feature in ds.features + assert ds.features[feature].dtype == "string" + assert "messages" in ds.features + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert 
ds.features["messages"][0]["role"].dtype == "string" + + +def validate_recipe(recipe_file_name): + with open(recipe_file_name, encoding="utf-8") as fp: + yaml_contents = yaml.safe_load(fp) + assert len(yaml_contents["datasets"]) == 1 + assert yaml_contents["datasets"][0]["path"].endswith(".jsonl") + assert "sampling_size" in yaml_contents["datasets"][0] + assert yaml_contents["metadata"]["sys_prompt"] == _SYS_PROMPT + + +def validate_mixed_dataset(dataset_file_name): + ds = load_dataset("json", data_files=dataset_file_name, split="train") + assert "messages" in ds.features + assert len(ds.features["messages"]) == 1 + assert len(ds.features["messages"][0]) == 2 + assert ds.features["messages"][0]["content"].dtype == "string" + assert ds.features["messages"][0]["role"].dtype == "string" + + +def validate_lm_eval_task(lm_eval_task_file_name): + with open(lm_eval_task_file_name, encoding="utf-8") as fp: + yaml_contents = yaml.safe_load(fp) + assert "task" in yaml_contents + assert "dataset_kwargs" in yaml_contents + assert "doc_to_text" in yaml_contents + assert "doc_to_choice" in yaml_contents + assert "doc_to_target" in yaml_contents + + +def validate_mmlubench_dataset(dataset_file_name): + with open(dataset_file_name, encoding="utf-8") as fp: + # FIXME: fix the mmlubench pipeline in this test + assert fp.readlines() == [] + + +def generate_test_samples(yaml_contents): + """Convert questions and answers from the taxonomy format into the + user/assistant format used by the legacy training methods such as + Linux training and MacOS training. + + This mirrors what _gen_test_data() does. 
+ """ + test_samples = [] + is_knowledge = "document" in yaml_contents + for seed_example in yaml_contents["seed_examples"]: + if is_knowledge: + for qna in seed_example["questions_and_answers"]: + test_samples.append( + { + "user": qna["question"] + + "\n" + + seed_example["context"].strip(), + "assistant": qna["answer"].strip(), + } + ) + + else: + # FIXME: handle freeform skills - no context + test_samples.append( + { + "user": seed_example["question"] + "\n" + seed_example["context"], + "assistant": seed_example["answer"], + } + ) + return test_samples + + +def generate_train_samples(yaml_contents): + """Generate expected training samples in the user/assistant format + used by the legacy training methods such as Linux training and MacOS + training. + + Mirroring _noop_llmblock_generate() below, we generate 10 samples + per input, and then follow _gen_train_data()'s output format. + """ + + def add_question_mark(q): + return (q + "?") if not "?" in q else q + + train_samples = [] + is_knowledge = "document" in yaml_contents + for seed_example in yaml_contents["seed_examples"]: + for i in range(NUM_INSTRUCTIONS_TO_GENERATE): + if is_knowledge: + train_samples.append( + { + "user": seed_example["context"] + + f" (q{i}) " + + add_question_mark( + seed_example["questions_and_answers"][0]["question"].strip() + ), + "assistant": f"(a{i}) " + + seed_example["questions_and_answers"][0]["answer"].strip(), + } + ) + else: + # FIXME: handle freeform skills - no context + train_samples.append( + { + "user": seed_example["context"] + + f" (q{i}) " + + add_question_mark(seed_example["question"]), + "assistant": f"(a{i}) " + seed_example["answer"], + } + ) + return train_samples + + +def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: + with open(skills_file_path, "r", encoding="utf-8") as skills_file: + return yaml.safe_load(skills_file) + + +def _noop_llmblock_generate(self, samples): + """Generate mock output based on input samples. 
+ + Simply return the seed question and response from the input sample, + joined using '?' and with an integer discriminator. + + _get_question_hack() and _get_response_hack() is the code that later + splits these using the '?' separator. + + Return 10 output samples per input samples, since the LLMBlock in the + simple pipeline is configured with 'n: scaled' and we pass + num_instructions_to_generate=10 to generate_data. + """ + + def strip_q(q): + return q.strip().rstrip("?") + + output = [] + for sample in samples: + for i in range(NUM_INSTRUCTIONS_TO_GENERATE): + if "domain" in sample: # knowledge + output.append( + sample["icl_document"] + + f" (q{i}) " + + strip_q(sample["icl_query_1"]) + + f" ? (a{i}) " + + sample["icl_response_1"] + ) + else: + output.append( + sample["seed_context"] + + f" (q{i}) " + + strip_q(sample["seed_question"]) + + f" ? (a{i}) " + + sample["seed_response"] + ) + return output + + +@patch.object(LLMBlock, "_generate", _noop_llmblock_generate) +class TestGenerateCompositionalData(unittest.TestCase): + @pytest.fixture(autouse=True) + def _init_taxonomy(self, taxonomy_dir): + self.test_taxonomy = taxonomy_dir + + def setUp(self): + self.tmp_path = tempfile.mkdtemp() + test_valid_compositional_skill_file = os.path.join( + TEST_DATA_DIR, "test_valid_compositional_skill.yaml" + ) + tracked_compositional_file = os.path.join( + "compositional_skills", "tracked", "qna.yaml" + ) + untracked_compositional_file = os.path.join( + "compositional_skills", "new", "qna.yaml" + ) + test_valid_compositional_skill = load_test_skills( + test_valid_compositional_skill_file + ) + self.test_taxonomy.add_tracked( + tracked_compositional_file, test_valid_compositional_skill + ) + self.test_taxonomy.create_untracked( + untracked_compositional_file, test_valid_compositional_skill + ) + self.expected_test_samples = generate_test_samples( + test_valid_compositional_skill + ) + self.expected_train_samples = generate_train_samples( + 
test_valid_compositional_skill + ) + + def test_generate(self): + with patch("logging.Logger.info") as mocked_logger: + generate_data( + mocked_logger, + model_family="merlinite", + model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + num_instructions_to_generate=10, + taxonomy=self.test_taxonomy.root, + taxonomy_base=TEST_TAXONOMY_BASE, + output_dir=self.tmp_path, + yaml_rules=TEST_CUSTOM_YAML_RULES, + client=MagicMock(), + pipeline="simple", + ) + + for name in ["test_*.jsonl", "train_*.jsonl", "messages_*.jsonl"]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.startswith("test_"): + validate_legacy_dataset(matches[0], self.expected_test_samples) + elif name.startswith("train_"): + validate_legacy_dataset(matches[0], self.expected_train_samples) + elif name.startswith("messages_"): + validate_messages_dataset(matches[0], self.expected_train_samples) + + node_file = os.path.join("node_datasets_*", "compositional_skills_new.jsonl") + for name in [ + "skills_recipe_*.yaml", + "skills_train_msgs_*.jsonl", + node_file, + ]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.endswith("compositional_skills_new.jsonl"): + validate_skill_leaf_node_dataset(matches[0]) + elif name.startswith("skills_recipe_"): + validate_recipe(matches[0]) + elif name.startswith("skills_train_msgs_"): + validate_mixed_dataset(matches[0]) + + def tearDown(self) -> None: + """Recursively remove the temporary repository and all of its + subdirectories and files. 
+ """ + shutil.rmtree(self.tmp_path) + return + + def __enter__(self): + return self + + def __exit__(self, *args): + self.tearDown() + + +@patch.object(LLMBlock, "_generate", _noop_llmblock_generate) +class TestGenerateKnowledgeData(unittest.TestCase): + @pytest.fixture(autouse=True) + def _init_taxonomy(self, taxonomy_dir): + self.test_taxonomy = taxonomy_dir + + def setUp(self): + self.tmp_path = tempfile.mkdtemp() + test_valid_knowledge_skill_file = os.path.join( + TEST_DATA_DIR, "test_valid_knowledge_skill.yaml" + ) + tracked_knowledge_file = os.path.join("knowledge", "tracked", "qna.yaml") + untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml") + test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file) + self.test_taxonomy.add_tracked( + tracked_knowledge_file, test_valid_knowledge_skill + ) + self.test_taxonomy.create_untracked( + untracked_knowledge_file, test_valid_knowledge_skill + ) + self.expected_test_samples = generate_test_samples(test_valid_knowledge_skill) + self.expected_train_samples = generate_train_samples(test_valid_knowledge_skill) + + def test_generate(self): + with patch("logging.Logger.info") as mocked_logger: + generate_data( + mocked_logger, + model_family="merlinite", + model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + num_instructions_to_generate=10, + taxonomy=self.test_taxonomy.root, + taxonomy_base=TEST_TAXONOMY_BASE, + output_dir=self.tmp_path, + yaml_rules=TEST_CUSTOM_YAML_RULES, + chunk_word_count=1000, + server_ctx_size=4096, + client=MagicMock(), + pipeline="simple", + ) + + for name in ["test_*.jsonl", "train_*.jsonl", "messages_*.jsonl"]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.startswith("test_"): + validate_legacy_dataset(matches[0], self.expected_test_samples) + elif name.startswith("train_"): + validate_legacy_dataset(matches[0], self.expected_train_samples) + elif name.startswith("messages_"): + 
validate_messages_dataset(matches[0], self.expected_train_samples) + + node_p07_file = os.path.join("node_datasets_*", "knowledge_new_p07.jsonl") + node_p10_file = os.path.join("node_datasets_*", "knowledge_new_p10.jsonl") + for name in [ + "skills_recipe_*.yaml", + "skills_train_msgs_*.jsonl", + "knowledge_recipe_*.yaml", + "knowledge_train_msgs_*.jsonl", + node_p07_file, + node_p10_file, + ]: + matches = glob.glob(os.path.join(self.tmp_path, name)) + assert len(matches) == 1 + if name.endswith("knowledge_new_p07.jsonl") or name.endswith( + "knowledge_new_p10.jsonl" + ): + validate_phase_leaf_node_dataset(matches[0]) + elif name.startswith("skills_recipe_") or name.startswith( + "knowledge_recipe_" + ): + validate_recipe(matches[0]) + elif name.startswith("skills_train_msgs_") or name.startswith( + "knowledge_train_msgs_" + ): + validate_mixed_dataset(matches[0]) + + for name in [ + "knowledge_new_task.yaml", + "mmlubench_knowledge_new.jsonl", + ]: + matches = glob.glob(os.path.join(self.tmp_path, "node_datasets_*", name)) + assert len(matches) == 1 + if name == "knowledge_new_task.yaml": + validate_lm_eval_task(matches[0]) + elif name == "mmlubench_knowledge_new.jsonl": + validate_mmlubench_dataset(matches[0]) + + def tearDown(self) -> None: + """Recursively remove the temporary repository and all of its + subdirectories and files. 
+ """ + shutil.rmtree(self.tmp_path) + return + + def __enter__(self): + return self + + def __exit__(self, *args): + self.tearDown() + def test_context_init_batch_size_optional(): """Test that the _context_init function can handle a missing batch size by diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index e8aa4bda..8c148113 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -1,36 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from typing import Any, Dict, Union import os import pathlib # Third Party import pytest +import yaml # First Party from instructlab.sdg.utils import taxonomy -TEST_VALID_COMPOSITIONAL_SKILL_YAML = """created_by: rafael-vasquez -version: 1 -seed_examples: -- answer: "Sure thing!" - context: "This is a valid YAML." - question: "Can you help me debug this failing unit test?" -- answer: "answer2" - context: "context2" - question: "question2" -- answer: "answer3" - context: "context3" - question: "question3" -- answer: "answer4" - context: "context4" - question: "question4" -- answer: "answer5" - context: "context5" - question: "question5" -task_description: 'This is a task' -""" - TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" 
TEST_CUSTOM_YAML_RULES = b"""extends: relaxed @@ -40,6 +21,13 @@ max: 180 """ +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") + + +def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: + with open(skills_file_path, "r", encoding="utf-8") as skills_file: + return yaml.safe_load(skills_file) + class TestTaxonomy: """Test taxonomy in instructlab.sdg.utils.taxonomy.""" @@ -80,17 +68,18 @@ def test_read_taxonomy_leaf_nodes( ): tracked_file = "compositional_skills/tracked/qna.yaml" untracked_file = "compositional_skills/new/qna.yaml" + test_compositional_skill_file = os.path.join( + TEST_DATA_DIR, "test_valid_compositional_skill.yaml" + ) + test_compositional_skill = load_test_skills(test_compositional_skill_file) if create_tracked_file: - self.taxonomy.add_tracked(tracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML) + self.taxonomy.add_tracked(tracked_file, test_compositional_skill) if create_untracked_file: - self.taxonomy.create_untracked( - untracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML - ) + self.taxonomy.create_untracked(untracked_file, test_compositional_skill) leaf_nodes = taxonomy.read_taxonomy_leaf_nodes( self.taxonomy.root, taxonomy_base, TEST_CUSTOM_YAML_RULES ) - assert len(leaf_nodes) == len(check_leaf_node_keys) for leaf_node_key in check_leaf_node_keys: diff --git a/tests/testdata/test_valid_compositional_skill.yaml b/tests/testdata/test_valid_compositional_skill.yaml new file mode 100644 index 00000000..df4ad2c8 --- /dev/null +++ b/tests/testdata/test_valid_compositional_skill.yaml @@ -0,0 +1,19 @@ +created_by: rafael-vasquez +version: 1 +seed_examples: +- answer: "Sure thing!" + context: "This is a valid YAML." + question: "Can you help me debug this failing unit test?" 
+- answer: "answer2" + context: "context2" + question: "question2" +- answer: "answer3" + context: "context3" + question: "question3" +- answer: "answer4" + context: "context4" + question: "question4" +- answer: "answer5" + context: "context5" + question: "question5" +task_description: 'This is a task' \ No newline at end of file diff --git a/tests/testdata/test_valid_knowledge_skill.yaml b/tests/testdata/test_valid_knowledge_skill.yaml new file mode 100644 index 00000000..705acb41 --- /dev/null +++ b/tests/testdata/test_valid_knowledge_skill.yaml @@ -0,0 +1,176 @@ +created_by: lukeinglis +domain: anatomy_tonsil +version: 3 +seed_examples: + - context: | + ## Structure + Humans are born with four types of tonsils: the pharyngeal tonsil, two + tubal tonsils, two palatine tonsils, and the lingual tonsils.[1] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + questions_and_answers: + - question: What is the location of the tubal tonsils? + answer: The location of the tubal tonsils is the roof of the pharynx. + - question: | + Compare the epithelial types, encapsulation, and presence of + crypts in the pharyngeal, tubal, and palatine tonsils according to the + table provided. + answer: | + The pharyngeal tonsil features ciliated pseudostratified columnar + epithelium and is incompletely encapsulated with small folds sometimes + described as crypts. The tubal tonsils also have ciliated + pseudostratified columnar epithelium but are not encapsulated and do + not possess crypts. In contrast, the palatine tonsils are covered with + stratified squamous epithelium, are fully encapsulated, and contain + multiple deep crypts. These structural differences are indicative of + their varied anatomical locations and potentially their distinct + functions within the immune system. + - question: What type of epithelium is found in the pharyngeal tonsil? 
+ answer: | + The type of epithelium found in the pharyngeal tonsil is ciliated + pseudostratified columnar (respiratory epithelium). + + + - context: | + The **tonsils** are a set of [lymphoid](Lymphatic_system "wikilink") + organs facing into the aerodigestive tract, which is known as + [Waldeyer's tonsillar ring](Waldeyer's_tonsillar_ring "wikilink") and + consists of the [adenoid tonsil](adenoid "wikilink") (or pharyngeal + tonsil), two [tubal tonsils](tubal_tonsil "wikilink"), two [palatine + tonsils](palatine_tonsil "wikilink"), and the [lingual + tonsils](lingual_tonsil "wikilink"). These organs play an important role + in the immune system. + + questions_and_answers: + - question: What is the immune system's first line of defense? + answer: | + The tonsils are the immune system's first line of defense against + ingested or inhaled foreign pathogens. + - question: What is Waldeyer's tonsillar ring? + answer: | + Waldeyer's tonsillar ring is a set of lymphoid organs facing into the + aerodigestive tract, consisting of the adenoid tonsil, two tubal + tonsils, two palatine tonsils, and the lingual tonsils. + - question: How many tubal tonsils are part of Waldeyer's tonsillar ring? + answer: There are two tubal tonsils as part of Waldeyer's tonsillar ring. + + - context: | + The palatine tonsils tend to reach their largest size in [puberty](puberty + "wikilink"), and they gradually undergo [atrophy](atrophy "wikilink") + thereafter. However, they are largest relative to the diameter of the + throat in young children. In adults, each palatine tonsil normally + measures up to 2.5 cm in length, 2.0 cm in width and 1.2 cm in + thickness.[2] + + questions_and_answers: + - question: When do the palatine tonsils tend to reach their largest size? + answer: The palatine tonsils tend to reach their largest size in puberty. + - question: What are the typical dimensions of an adult palatine tonsil? 
+ answer: | + In adults, each palatine tonsil normally measures up to 2.5 cm in + length, 2.0 cm in width, and 1.2 cm in thickness. + - question: How do the palatine tonsils change in size with age? + answer: | + The palatine tonsils tend to gradually undergo atrophy after puberty, + becoming smaller in size compared to their dimensions in young + children. + + - context: | + The tonsils are immunocompetent organs that serve as the immune system's + first line of defense against ingested or inhaled foreign pathogens, and + as such frequently engorge with blood to assist in immune responses to + common illnesses such as the common cold. The tonsils have on their + surface specialized antigen capture cells called [microfold + cells](microfold_cell "wikilink") (M cells) that allow for the uptake of + antigens produced by pathogens. These M cells then alert the B cells and T + cells in the tonsil that a pathogen is present and an immune response is + stimulated.[3] B cells are activated and proliferate in areas called + germinal centers in the tonsil. These germinal centers are places where B + memory cells are created and [secretory antibody (IgA)](Immunoglobulin_A + "wikilink") is produced. + + questions_and_answers: + - question: | + What are the specialized antigen capture cells on the surface of the + tonsils called? + answer: | + The specialized antigen capture cells on the surface of the tonsils + are called microfold cells (M cells). + - question: What is the role of microfold cells in the tonsils? + answer: | + Microfold cells (M cells) allow for the uptake of antigens produced by + pathogens. They alert the B cells and T cells in the tonsil that a + pathogen is present, stimulating an immune response. + - question: Where do B cells proliferate in the tonsils? + answer: B cells proliferate in areas called germinal centers in the tonsils. 
+ + - context: | + A [tonsillolith](tonsillolith "wikilink") (also known as a "tonsil stone") + is material that accumulates on the palatine tonsil. This can reach the + size of a [peppercorn](peppercorn "wikilink") and is white or cream in + color. The main substance is mostly [calcium](calcium "wikilink"), but it + has a strong unpleasant odor because of [hydrogen + sulfide](hydrogen_sulfide "wikilink") and [methyl + mercaptan](methyl_mercaptan "wikilink") and other chemicals.[6] + + questions_and_answers: + - question: What is a tonsillolith? + answer: | + A tonsillolith (tonsil stone) is material that accumulates on the + palatine tonsil, reaching the size of a peppercorn and having a white + or cream color. It contains calcium and has a strong unpleasant odor + due to hydrogen sulfide, methyl mercaptan, and other chemicals. + - question: What is the main substance found in a tonsillolith? + answer: The main substance found in a tonsillolith is mostly calcium. + - question: Why do tonsilloliths have a strong unpleasant odor? + answer: | + Tonsilloliths have a strong unpleasant odor due to hydrogen sulfide, + methyl mercaptan, and other chemicals. + +document_outline: | + Overview of Human tonsils, describing their types, locations, structure, + function, and clinical significance, with a specific focus on their role in + the immune system and related health issues. + +document: + repo: https://github.com/luke-inglis/il-anatomy-knowledge + commit: cc7c6ca + patterns: + - anatomy1.md

| Type | Epithelium | Capsule | Crypts | Location |
| --- | --- | --- | --- | --- |
| Pharyngeal tonsil (also termed "adenoid") | Ciliated pseudostratified columnar (respiratory epithelium) | Incompletely encapsulated | Small folds—sometimes described as crypts1 | Roof of pharynx |
| Tubal tonsils | Ciliated pseudostratified columnar (respiratory epithelium) | Not encapsulated | No crypts | Roof of pharynx |
| Palatine tonsils | Stratified squamous epithelium | Fully encapsulated | Multiple deep crypts | Each side of the throat at the back of the mouth |