diff --git a/README.md b/README.md
index d26f0b17..e6cba746 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ that includes it. Once it reaches the end of its lifespan, the experiment will b
The latest version of the package contains the following experiments:
+
| Name | Type | Expected End Date | Dependencies | Cookbook | Discussion |
| --------------------------- | -------------------------- | ---------------------------- | ------------ | -------- | ---------- |
| [`EvaluationHarness`][1] | Evaluation orchestrator | October 2024 | None | | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/74) |
@@ -46,7 +48,7 @@ The latest version of the package contains the following experiments:
| [`ChatMessageRetriever`][6] | Memory Component | December 2024 | None | | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/75) |
| [`InMemoryChatMessageStore`][7] | Memory Store | December 2024 | None | | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/75) |
| [`Auto-Merging Retriever`][8] & [`HierarchicalDocumentSplitter`][9]| Document Splitting & Retrieval Technique | December 2024 | None | | [Discuss](https://github.com/deepset-ai/haystack-experimental/discussions/78) |
-
+| [`LLMMetadataExtractor`][13] | Metadata extraction with LLM | December 2024 | None | | |
[1]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/evaluation/harness
[2]: https://github.com/deepset-ai/haystack-experimental/tree/main/haystack_experimental/components/tools/openai
@@ -60,7 +62,7 @@ The latest version of the package contains the following experiments:
[10]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/dataclasses/chat_message.py
[11]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/components/generators/chat/openai.py
[12]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/components/tools/tool_invoker.py
-
+[13]: https://github.com/deepset-ai/haystack-experimental/blob/main/haystack_experimental/components/extractors/llm_metadata_extractor.py
## Usage
diff --git a/docs/pydoc/config/extractors_api.yml b/docs/pydoc/config/extractors_api.yml
new file mode 100644
index 00000000..d8dbdd48
--- /dev/null
+++ b/docs/pydoc/config/extractors_api.yml
@@ -0,0 +1,27 @@
+loaders:
+ - type: haystack_pydoc_tools.loaders.CustomPythonLoader
+ search_path: [../../../]
+ modules: ["haystack_experimental.components.extractors.llm_metadata_extractor"]
+ ignore_when_discovered: ["__init__"]
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
+ excerpt: Extracting information from documents.
+ category_slug: experiments-api
+ title: Extractors
+  slug: experimental-extractors-api
+ order: 50
+ markdown:
+ descriptive_class_title: false
+ classdef_code_block: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: experimental_extractors.md
diff --git a/haystack_experimental/components/__init__.py b/haystack_experimental/components/__init__.py
index 75afcfb1..82324492 100644
--- a/haystack_experimental/components/__init__.py
+++ b/haystack_experimental/components/__init__.py
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0
+
+from .extractors import LLMMetadataExtractor
from .generators.chat import OpenAIChatGenerator
from .retrievers.auto_merging_retriever import AutoMergingRetriever
from .retrievers.chat_message_retriever import ChatMessageRetriever
@@ -9,12 +11,12 @@
from .tools import OpenAIFunctionCaller, ToolInvoker
from .writers import ChatMessageWriter
-
__all__ = [
"AutoMergingRetriever",
"ChatMessageWriter",
"ChatMessageRetriever",
"OpenAIChatGenerator",
+ "LLMMetadataExtractor",
"HierarchicalDocumentSplitter",
"OpenAIFunctionCaller",
"ToolInvoker"
diff --git a/haystack_experimental/components/extractors/__init__.py b/haystack_experimental/components/extractors/__init__.py
new file mode 100644
index 00000000..d0f881df
--- /dev/null
+++ b/haystack_experimental/components/extractors/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack_experimental.components.extractors.llm_metadata_extractor import LLMMetadataExtractor, LLMProvider
+
+__all__ = ["LLMMetadataExtractor", "LLMProvider"]
diff --git a/haystack_experimental/components/extractors/llm_metadata_extractor.py b/haystack_experimental/components/extractors/llm_metadata_extractor.py
new file mode 100644
index 00000000..8528e26e
--- /dev/null
+++ b/haystack_experimental/components/extractors/llm_metadata_extractor.py
@@ -0,0 +1,274 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+from enum import Enum
+from typing import Any, Dict, List, Optional, Union
+
+from haystack import Document, component, default_from_dict, default_to_dict, logging
+from haystack.components.builders import PromptBuilder
+from haystack.components.generators import AzureOpenAIGenerator, OpenAIGenerator
+from haystack.lazy_imports import LazyImport
+from haystack.utils import deserialize_secrets_inplace
+
+with LazyImport(message="Run 'pip install \"amazon-bedrock-haystack==1.0.2\"'") as amazon_bedrock_generator:
+ from haystack_integrations.components.generators.amazon_bedrock import AmazonBedrockGenerator
+
+with LazyImport(message="Run 'pip install \"google-vertex-haystack==2.0.0\"'") as vertex_ai_gemini_generator:
+ from haystack_integrations.components.generators.google_vertex import VertexAIGeminiGenerator
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMProvider(Enum):
+ """
+    LLM providers currently supported by `LLMMetadataExtractor`.
+ """
+
+ OPENAI = "openai"
+ OPENAI_AZURE = "openai_azure"
+ AWS_BEDROCK = "aws_bedrock"
+ GOOGLE_VERTEX = "google_vertex"
+
+ @staticmethod
+ def from_str(string: str) -> "LLMProvider":
+ """
+        Convert a string to an LLMProvider enum.
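+
+        For example, `LLMProvider.from_str("openai")` returns `LLMProvider.OPENAI`.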
+ """
+ provider_map = {e.value: e for e in LLMProvider}
+ provider = provider_map.get(string)
+ if provider is None:
+            msg = (
+                f"Invalid LLMProvider '{string}'. "
+                f"Supported LLMProviders are: {list(provider_map.keys())}"
+            )
+ raise ValueError(msg)
+ return provider
+
+
+@component
+class LLMMetadataExtractor:
+ """
+    Extracts metadata from documents using a Large Language Model (LLM).
+
+    The metadata is extracted by providing a prompt to an LLM that generates it. Supported providers are OpenAI, Azure OpenAI, Amazon Bedrock, and Google Vertex AI.
+
+ ```python
+ from haystack import Document
+    from haystack_experimental.components.extractors import LLMMetadataExtractor, LLMProvider
+
+ NER_PROMPT = '''
+ -Goal-
+ Given text and a list of entity types, identify all entities of those types from the text.
+
+ -Steps-
+ 1. Identify all entities. For each identified entity, extract the following information:
+ - entity_name: Name of the entity, capitalized
+ - entity_type: One of the following types: [organization, product, service, industry]
+ Format each entity as {"entity": , "entity_type": }
+
+    2. Return output in a single list with all the entities identified in step 1.
+
+ -Examples-
+ ######################
+ Example 1:
+ entity_types: [organization, person, partnership, financial metric, product, service, industry, investment strategy, market trend]
+ text:
+ Another area of strength is our co-brand issuance. Visa is the primary network partner for eight of the top 10 co-brand partnerships in the US today and we are pleased that Visa has finalized a multi-year extension of our successful credit co-branded partnership with Alaska Airlines, a portfolio that benefits from a loyal customer base and high cross-border usage.
+ We have also had significant co-brand momentum in CEMEA. First, we launched a new co-brand card in partnership with Qatar Airways, British Airways and the National Bank of Kuwait. Second, we expanded our strong global Marriott relationship to launch Qatar's first hospitality co-branded card with Qatar Islamic Bank. Across the United Arab Emirates, we now have exclusive agreements with all the leading airlines marked by a recent agreement with Emirates Skywards.
+ And we also signed an inaugural Airline co-brand agreement in Morocco with Royal Air Maroc. Now newer digital issuers are equally
+ ------------------------
+ output:
+ {"entities": [{"entity": "Visa", "entity_type": "company"}, {"entity": "Alaska Airlines", "entity_type": "company"}, {"entity": "Qatar Airways", "entity_type": "company"}, {"entity": "British Airways", "entity_type": "company"}, {"entity": "National Bank of Kuwait", "entity_type": "company"}, {"entity": "Marriott", "entity_type": "company"}, {"entity": "Qatar Islamic Bank", "entity_type": "company"}, {"entity": "Emirates Skywards", "entity_type": "company"}, {"entity": "Royal Air Maroc", "entity_type": "company"}]}
+ #############################
+ -Real Data-
+ ######################
+ entity_types: [company, organization, person, country, product, service]
+ text: {{input_text}}
+ ######################
+ output:
+ '''
+
+ docs = [
+ Document(content="deepset was founded in 2018 in Berlin, and is known for its Haystack framework"),
+ Document(content="Hugging Face is a company founded in Paris, France and is known for its Transformers library")
+ ]
+
+    extractor = LLMMetadataExtractor(prompt=NER_PROMPT, expected_keys=["entities"], generator_api=LLMProvider.OPENAI, input_text='input_text')
+ extractor.run(documents=docs)
+ >> {'documents': [
+ Document(id=.., content: 'deepset was founded in 2018 in Berlin, and is known for its Haystack framework',
+ meta: {'entities': [{'entity': 'deepset', 'entity_type': 'company'}, {'entity': 'Berlin', 'entity_type': 'city'},
+ {'entity': 'Haystack', 'entity_type': 'product'}]}),
+ Document(id=.., content: 'Hugging Face is a company founded in Paris, France and is known for its Transformers library',
+ meta: {'entities': [
+ {'entity': 'Hugging Face', 'entity_type': 'company'}, {'entity': 'Paris', 'entity_type': 'city'},
+ {'entity': 'France', 'entity_type': 'country'}, {'entity': 'Transformers', 'entity_type': 'product'}
+ ]})
+ ]
+ }
+ ```
+ """ # noqa: E501
+
+ def __init__( # pylint: disable=R0917
+ self,
+ prompt: str,
+ input_text: str,
+ expected_keys: List[str],
+        generator_api: Union[str, LLMProvider],
+ generator_api_params: Optional[Dict[str, Any]] = None,
+ raise_on_failure: bool = False,
+ ):
+ """
+ Initializes the LLMMetadataExtractor.
+
+ :param prompt: The prompt to be used for the LLM.
+        :param input_text: The name of the prompt variable that receives the document content.
+ :param expected_keys: The keys expected in the JSON output from the LLM.
+ :param generator_api: The API provider for the LLM.
+ :param generator_api_params: The parameters for the LLM generator.
+ :param raise_on_failure: Whether to raise an error on failure to validate JSON output.
+
+ """
+ self.prompt = prompt
+ self.input_text = input_text
+ self.builder = PromptBuilder(prompt, required_variables=[input_text])
+ self.raise_on_failure = raise_on_failure
+ self.expected_keys = expected_keys
+        self.generator_api = (
+            generator_api if isinstance(generator_api, LLMProvider) else LLMProvider.from_str(generator_api)
+        )
+ self.generator_api_params = generator_api_params or {}
+ self.llm_provider = self._init_generator(self.generator_api, self.generator_api_params)
+ if self.input_text not in self.prompt:
+ raise ValueError(f"Input text '{self.input_text}' must be in the prompt.")
+
+ @staticmethod
+ def _init_generator(
+ generator_api: LLMProvider,
+ generator_api_params: Optional[Dict[str, Any]]
+ ) -> Union[OpenAIGenerator, AzureOpenAIGenerator, AmazonBedrockGenerator, VertexAIGeminiGenerator]:
+ """
+        Initialize the generator based on the specified API provider and parameters.
+ """
+ if generator_api == LLMProvider.OPENAI:
+ return OpenAIGenerator(**generator_api_params)
+ if generator_api == LLMProvider.OPENAI_AZURE:
+ return AzureOpenAIGenerator(**generator_api_params)
+ if generator_api == LLMProvider.AWS_BEDROCK:
+ amazon_bedrock_generator.check()
+ return AmazonBedrockGenerator(**generator_api_params)
+ if generator_api == LLMProvider.GOOGLE_VERTEX:
+ vertex_ai_gemini_generator.check()
+ return VertexAIGeminiGenerator(**generator_api_params)
+ raise ValueError(f"Unsupported generator API: {generator_api}")
+
+ def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str) -> bool:
+ """
+        Checks whether the received output is valid JSON containing the expected keys.
+
+        :param expected:
+            Keys expected in the JSON output from the LLM.
+        :param received:
+            The raw string output received from the LLM.
+
+ :raises ValueError:
+            If the output is not valid JSON with the expected keys:
+            - with `raise_on_failure` set to True, a ValueError is raised.
+            - with `raise_on_failure` set to False, a warning is logged and False is returned.
+
+ :returns:
+ True if the received output is a valid JSON with the expected keys, False otherwise.
+ """
+ try:
+ parsed_output = json.loads(received)
+ except json.JSONDecodeError:
+ msg = "Response from LLM is not a valid JSON."
+ if self.raise_on_failure:
+ raise ValueError(msg)
+ logger.warning(msg)
+ return False
+
+ if not all(output in parsed_output for output in expected):
+ msg = f"Expected response from LLM to be a JSON with keys {expected}, got {received}."
+ if self.raise_on_failure:
+ raise ValueError(msg)
+ logger.warning(msg)
+ return False
+
+ return True
+
+ def to_dict(self) -> Dict[str, Any]:
+ """
+ Serializes the component to a dictionary.
+
+ :returns:
+ Dictionary with serialized data.
+ """
+
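+        # Serialize the wrapped generator and inline only its init parameters;
+        # the provider itself is stored as the string value of `generator_api`.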
+        generator_dict = self.llm_provider.to_dict()
+
+ return default_to_dict(
+ self,
+ prompt=self.prompt,
+ input_text=self.input_text,
+ expected_keys=self.expected_keys,
+ raise_on_failure=self.raise_on_failure,
+ generator_api=self.generator_api.value,
+            generator_api_params=generator_dict["init_parameters"],
+ )
+
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any]) -> "LLMMetadataExtractor":
+ """
+ Deserializes the component from a dictionary.
+
+ :param data:
+ Dictionary with serialized data.
+ :returns:
+ An instance of the component.
+ """
+
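+        # Restore the provider enum from its serialized string value and rehydrate
+        # any Secret-typed api_key before delegating to default_from_dict.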
+ init_parameters = data.get("init_parameters", {})
+ if "generator_api" in init_parameters:
+ data["init_parameters"]["generator_api"] = LLMProvider.from_str(data["init_parameters"]["generator_api"])
+ if "generator_api_params" in init_parameters:
+ deserialize_secrets_inplace(data["init_parameters"]["generator_api_params"], keys=["api_key"])
+ return default_from_dict(cls, data)
+
+ @component.output_types(documents=List[Document], errors=Dict[str, Any])
+ def run(self, documents: List[Document]) -> Dict[str, Any]:
+ """
+ Extract metadata from documents using a Language Model.
+
+ :param documents: List of documents to extract metadata from.
+ :returns:
+ A dictionary with the keys:
+ - "documents": List of documents with extracted metadata.
+ - "errors": A dictionary with document IDs as keys and error messages as values.
+ """
+ errors = {}
+ for document in documents:
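+            # Fill the prompt template with this document's content, query the LLM,
+            # and accept the reply only if it is valid JSON with the expected keys.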
+            prompt_with_doc = self.builder.run(**{self.input_text: document.content})
+ result = self.llm_provider.run(prompt=prompt_with_doc["prompt"])
+ llm_answer = result["replies"][0]
+ if self.is_valid_json_and_has_expected_keys(expected=self.expected_keys, received=llm_answer):
+ extracted_metadata = json.loads(llm_answer)
+ for k in self.expected_keys:
+ document.meta[k] = extracted_metadata[k]
+ else:
+ errors[document.id] = llm_answer
+
+ return {"documents": documents, "errors": errors}
diff --git a/pyproject.toml b/pyproject.toml
index fb97b136..93c3b910 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,9 @@ extra-dependencies = [
"fastapi",
# Tool
"jsonschema",
+ # LLMMetadataExtractor dependencies
+ "amazon-bedrock-haystack>=1.0.2",
+ "google-vertex-haystack>=2.0.0",
]
[tool.hatch.envs.test.scripts]
diff --git a/test/components/extractors/__init__.py b/test/components/extractors/__init__.py
new file mode 100644
index 00000000..c1764a6e
--- /dev/null
+++ b/test/components/extractors/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/test/components/extractors/test_llm_metadata_extractor.py b/test/components/extractors/test_llm_metadata_extractor.py
new file mode 100644
index 00000000..72686d09
--- /dev/null
+++ b/test/components/extractors/test_llm_metadata_extractor.py
@@ -0,0 +1,176 @@
+import os
+import pytest
+
+from haystack import Pipeline, Document
+from haystack.components.builders import PromptBuilder
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack_experimental.components import LLMMetadataExtractor
+from haystack_experimental.components.extractors import LLMProvider
+
+
+class TestLLMMetadataExtractor:
+
+ def test_init_default(self, monkeypatch):
+ monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+ extractor = LLMMetadataExtractor(
+ prompt="prompt {{test}}",
+ expected_keys=["key1", "key2"],
+ generator_api=LLMProvider.OPENAI,
+ input_text="test"
+ )
+ assert isinstance(extractor.builder, PromptBuilder)
+ assert extractor.generator_api == LLMProvider.OPENAI
+ assert extractor.expected_keys == ["key1", "key2"]
+ assert extractor.raise_on_failure is False
+ assert extractor.input_text == "test"
+
+ def test_init_with_parameters(self, monkeypatch):
+ monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+ extractor = LLMMetadataExtractor(
+ prompt="prompt {{test}}",
+ expected_keys=["key1", "key2"],
+ raise_on_failure=True,
+ generator_api=LLMProvider.OPENAI,
+ generator_api_params={
+ 'model': 'gpt-3.5-turbo',
+ 'generation_kwargs': {"temperature": 0.5}
+ },
+ input_text="test")
+ assert isinstance(extractor.builder, PromptBuilder)
+ assert extractor.expected_keys == ["key1", "key2"]
+ assert extractor.raise_on_failure is True
+ assert extractor.generator_api == LLMProvider.OPENAI
+ assert extractor.generator_api_params == {
+ 'model': 'gpt-3.5-turbo',
+ 'generation_kwargs': {"temperature": 0.5}
+ }
+
+ def test_to_dict(self, monkeypatch):
+ monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+ extractor = LLMMetadataExtractor(
+ prompt="some prompt that was used with the LLM {{test}}",
+ expected_keys=["key1", "key2"],
+ generator_api=LLMProvider.OPENAI,
+ input_text="test",
+ generator_api_params={'model': 'gpt-4o-mini', 'generation_kwargs': {"temperature": 0.5}},
+ raise_on_failure=True)
+ extractor_dict = extractor.to_dict()
+ assert extractor_dict == {
+ 'type': 'haystack_experimental.components.extractors.llm_metadata_extractor.LLMMetadataExtractor',
+ 'init_parameters': {
+ 'prompt': 'some prompt that was used with the LLM {{test}}',
+ 'expected_keys': ['key1', 'key2'],
+ 'raise_on_failure': True,
+ 'input_text': 'test',
+ 'generator_api': 'openai',
+ 'generator_api_params': {
+ 'api_base_url': None,
+ 'api_key': {'env_vars': ['OPENAI_API_KEY'],'strict': True,'type': 'env_var'},
+ 'generation_kwargs': {"temperature": 0.5},
+ 'model': 'gpt-4o-mini',
+ 'organization': None,
+ 'streaming_callback': None,
+ 'system_prompt': None,
+ },
+ }
+ }
+
+ def test_from_dict(self, monkeypatch):
+ monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+ extractor_dict = {
+ 'type': 'haystack_experimental.components.extractors.llm_metadata_extractor.LLMMetadataExtractor',
+ 'init_parameters': {
+ 'prompt': 'some prompt that was used with the LLM {{test}}',
+ 'expected_keys': ['key1', 'key2'],
+ 'raise_on_failure': True,
+ 'input_text': 'test',
+ 'generator_api': 'openai',
+ 'generator_api_params': {
+ 'api_base_url': None,
+ 'api_key': {'env_vars': ['OPENAI_API_KEY'], 'strict': True, 'type': 'env_var'},
+ 'generation_kwargs': {},
+ 'model': 'gpt-4o-mini',
+ 'organization': None,
+ 'streaming_callback': None,
+ 'system_prompt': None,
+ }
+ }
+ }
+ extractor = LLMMetadataExtractor.from_dict(extractor_dict)
+ assert extractor.raise_on_failure is True
+ assert extractor.expected_keys == ["key1", "key2"]
+ assert extractor.prompt == "some prompt that was used with the LLM {{test}}"
+ assert extractor.generator_api == LLMProvider.OPENAI
+
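+    def test_is_valid_json_and_has_expected_keys(self, monkeypatch):
+        # A small additional sketch, not part of the original changeset: it exercises
+        # the JSON validation helper directly. With the default raise_on_failure=False,
+        # invalid JSON or missing keys should return False rather than raise.
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        extractor = LLMMetadataExtractor(
+            prompt="prompt {{test}}",
+            expected_keys=["entities"],
+            generator_api=LLMProvider.OPENAI,
+            input_text="test",
+        )
+        assert extractor.is_valid_json_and_has_expected_keys(expected=["entities"], received='{"entities": []}')
+        assert not extractor.is_valid_json_and_has_expected_keys(expected=["entities"], received="not json")
+        assert not extractor.is_valid_json_and_has_expected_keys(expected=["entities"], received='{"other": []}')
+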
+ @pytest.mark.integration
+ @pytest.mark.skipif(
+ not os.environ.get("OPENAI_API_KEY", None),
+ reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+ )
+ def test_live_run(self):
+ docs = [
+ Document(content="deepset was founded in 2018 in Berlin, and is known for its Haystack framework"),
+ Document(content="Hugging Face is a company founded in Paris, France and is known for its Transformers library")
+ ]
+
+ ner_prompt = """
+ Given a text and a list of entity types, identify all entities of those types from the text.
+
+ -Steps-
+ 1. Identify all entities. For each identified entity, extract the following information:
+ - entity_name: Name of the entity, capitalized
+ - entity_type: One of the following types: [organization, person, product, service, industry]
+ Format each entity as {"entity": , "entity_type": }
+
+        2. Return output in a single list with all the entities identified in step 1.
+
+ -Examples-
+ ######################
+ Example 1:
+ entity_types: [organization, product, service, industry, investment strategy, market trend]
+ text:
+ Another area of strength is our co-brand issuance. Visa is the primary network partner for eight of the top 10 co-brand partnerships in the US today and we are pleased that Visa has finalized a multi-year extension of our successful credit co-branded partnership with Alaska Airlines, a portfolio that benefits from a loyal customer base and high cross-border usage.
+ We have also had significant co-brand momentum in CEMEA. First, we launched a new co-brand card in partnership with Qatar Airways, British Airways and the National Bank of Kuwait. Second, we expanded our strong global Marriott relationship to launch Qatar's first hospitality co-branded card with Qatar Islamic Bank. Across the United Arab Emirates, we now have exclusive agreements with all the leading airlines marked by a recent agreement with Emirates Skywards.
+ And we also signed an inaugural Airline co-brand agreement in Morocco with Royal Air Maroc. Now newer digital issuers are equally
+ ------------------------
+ output:
+ {"entities": [{"entity": "Visa", "entity_type": "company"}, {"entity": "Alaska Airlines", "entity_type": "company"}, {"entity": "Qatar Airways", "entity_type": "company"}, {"entity": "British Airways", "entity_type": "company"}, {"entity": "National Bank of Kuwait", "entity_type": "company"}, {"entity": "Marriott", "entity_type": "company"}, {"entity": "Qatar Islamic Bank", "entity_type": "company"}, {"entity": "Emirates Skywards", "entity_type": "company"}, {"entity": "Royal Air Maroc", "entity_type": "company"}]}
+ #############################
+
+ -Real Data-
+ ######################
+ entity_types: [company, organization, person, country, product, service]
+ text: {{input_text}}
+ ######################
+ output:
+ """
+
+ doc_store = InMemoryDocumentStore()
+ extractor = LLMMetadataExtractor(prompt=ner_prompt, expected_keys=["entities"], input_text="input_text", generator_api=LLMProvider.OPENAI)
+ writer = DocumentWriter(document_store=doc_store)
+ pipeline = Pipeline()
+ pipeline.add_component("extractor", extractor)
+ pipeline.add_component("doc_writer", writer)
+ pipeline.connect("extractor.documents", "doc_writer.documents")
+ pipeline.run(data={"documents": docs})
+
+ doc_store_docs = doc_store.filter_documents()
+ assert len(doc_store_docs) == 2
+ assert "entities" in doc_store_docs[0].meta
+ assert "entities" in doc_store_docs[1].meta