From 38038037b0bef94d54d775bb138e472e73cf0cc6 Mon Sep 17 00:00:00 2001 From: C Cheng <10414576+ccheng26@users.noreply.github.com> Date: Mon, 2 Dec 2024 10:00:22 -0500 Subject: [PATCH] feat: add question generation for imagine la and bem dataset (#144) --- .../generate_q_a_pairs.py | 7 +- .../q_a_generation.ipynb | 146 ++++++++++++++++-- 2 files changed, 141 insertions(+), 12 deletions(-) diff --git a/app/notebooks/question_answer_generator/generate_q_a_pairs.py b/app/notebooks/question_answer_generator/generate_q_a_pairs.py index cdc9060d..ec4776ab 100644 --- a/app/notebooks/question_answer_generator/generate_q_a_pairs.py +++ b/app/notebooks/question_answer_generator/generate_q_a_pairs.py @@ -37,6 +37,7 @@ class QuestionAnswerAttributes(QuestionAnswerPair): document_source: str document_id: UUID chunk_id: Optional[UUID] + dataset: str class QuestionAnswerList(BaseModel): @@ -69,7 +70,10 @@ def generate_question_answer_pairs(llm: str, message: str) -> QuestionAnswerList def process_document_or_chunk( - document: Document | Chunk, num_of_chunks: int, llm: str + document: Document | Chunk, + num_of_chunks: int, + llm: str, + dataset: str, ) -> list[QuestionAnswerAttributes]: generated_question_answers = generate_question_answer_pairs( llm=llm, @@ -88,6 +92,7 @@ def process_document_or_chunk( question=generated_question_answer.question, answer=generated_question_answer.answer, chunk_id=None if is_document else document.id, + dataset=dataset, ) question_answer_list.append(question_answer_item) diff --git a/app/notebooks/question_answer_generator/q_a_generation.ipynb b/app/notebooks/question_answer_generator/q_a_generation.ipynb index 39a50f30..375406d0 100644 --- a/app/notebooks/question_answer_generator/q_a_generation.ipynb +++ b/app/notebooks/question_answer_generator/q_a_generation.ipynb @@ -11,10 +11,38 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "86c9ace5642e4dc99a77870a73f6bb95", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntText(value=5, description='Number of questions per chunk/document:')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "009e84092ef348b58ab6b9a777946bdf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='OpenAI LLM Model:', index=1, options=('gpt-3.5-turbo-instruct', 'gpt-4o', 'gpt-4o-mini')…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import logging\n", - "from sqlalchemy import select\n", "import ipywidgets as widgets\n", "\n", "from notebooks.question_answer_generator.generate_q_a_pairs import process_document_or_chunk, write_question_answer_json_to_csv\n", @@ -41,33 +69,129 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2624808f256c43e793bb61d7a6ae3aa8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(description='Question source:', options=('document', 'chunk'), value='document')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "question_gen_selection = widgets.RadioButtons(\n", + " options=['document', 'chunk'],\n", + " value='document',\n", + " description='Question source:',\n", + " disabled=False\n", + ")\n", + "display(question_gen_selection)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "df7a2ace17ba401aabdf124a024373e0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SelectMultiple(description='Dataset source:', index=(0, 1), options=('Imagine LA', 'CA EDD', 'BEM'), value=('I…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "question_gen_selection = input(\"Generate questions by chunk or document?\")" + "question_dataset = widgets.SelectMultiple(\n", + " options=['Imagine LA', 'CA EDD', 'BEM'],\n", + " value=['Imagine LA', 'CA EDD' ],\n", + " description='Dataset source:',\n", + " disabled=False\n", + ")\n", + "display(question_dataset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-26 19:03:19,193 - INFO - Constructed database configuration\n", + "2024-11-26 19:03:19,212 - INFO - connected to postgres db\n", + "2024-11-26 19:03:19,212 - WARNING - database connection is not using SSL\n" + ] + }, + { + "ename": "ProgrammingError", + "evalue": "(psycopg.errors.UndefinedFunction) operator does not exist: text && character varying\nLINE 3: WHERE document.dataset && $1::VARCHAR\n ^\nHINT: No operator matches the given name and argument types. You might need to add explicit type casts.\n[SQL: SELECT document.name AS document_name, document.content AS document_content, document.source AS document_source, document.dataset AS document_dataset, document.program AS document_program, document.region AS document_region, document.id AS document_id, document.created_at AS document_created_at, document.updated_at AS document_updated_at \nFROM document \nWHERE document.dataset && %(dataset_1)s::VARCHAR]\n[SQL parameters hidden due to hide_parameters=True]\n(Background on this error at: https://sqlalche.me/e/20/f405)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mUndefinedFunction\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1967\u001b[0m, in \u001b[0;36mConnection._exec_single_context\u001b[0;34m(self, dialect, context, statement, parameters)\u001b[0m\n\u001b[1;32m 1966\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m evt_handled:\n\u001b[0;32m-> 1967\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdialect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdo_execute\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1968\u001b[0m \u001b[43m \u001b[49m\u001b[43mcursor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstr_statement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meffective_parameters\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\n\u001b[1;32m 1969\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1971\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_has_events \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine\u001b[38;5;241m.\u001b[39m_has_events:\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/default.py:941\u001b[0m, in \u001b[0;36mDefaultDialect.do_execute\u001b[0;34m(self, cursor, statement, parameters, context)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdo_execute\u001b[39m(\u001b[38;5;28mself\u001b[39m, cursor, statement, parameters, context\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 941\u001b[0m \u001b[43mcursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/psycopg/cursor.py:97\u001b[0m, in \u001b[0;36mCursor.execute\u001b[0;34m(self, query, params, prepare, binary)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m e\u001b[38;5;241m.\u001b[39m_NO_TRACEBACK \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 97\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ex\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", + "\u001b[0;31mUndefinedFunction\u001b[0m: operator does not exist: text && character varying\nLINE 3: WHERE document.dataset && $1::VARCHAR\n ^\nHINT: No operator matches the given name and argument types. You might need to add explicit type casts.", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mProgrammingError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m app_config\u001b[38;5;241m.\u001b[39mdb_session() \u001b[38;5;28;01mas\u001b[39;00m db_session:\n\u001b[1;32m 2\u001b[0m selected_dataset \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(question_dataset\u001b[38;5;241m.\u001b[39mvalue)\n\u001b[0;32m----> 4\u001b[0m documents \u001b[38;5;241m=\u001b[39m \u001b[43mdb_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDocument\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDocument\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m&&\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselected_dataset\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(documents)\n\u001b[1;32m 6\u001b[0m fields \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquestion\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124manswer\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdocument_name\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdocument_source\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdataset\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdocument_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunk_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/orm/query.py:2673\u001b[0m, in \u001b[0;36mQuery.all\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2651\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mall\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[_T]:\n\u001b[1;32m 2652\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return the results represented by this :class:`_query.Query`\u001b[39;00m\n\u001b[1;32m 2653\u001b[0m \u001b[38;5;124;03m as a list.\u001b[39;00m\n\u001b[1;32m 2654\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2671\u001b[0m \u001b[38;5;124;03m :meth:`_engine.Result.scalars` - v2 comparable method.\u001b[39;00m\n\u001b[1;32m 2672\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2673\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mall()\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/orm/query.py:2827\u001b[0m, in \u001b[0;36mQuery._iter\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2824\u001b[0m params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_params\n\u001b[1;32m 2826\u001b[0m statement \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_statement_20()\n\u001b[0;32m-> 2827\u001b[0m result: Union[ScalarResult[_T], Result[_T]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2828\u001b[0m \u001b[43m \u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2829\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2830\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m_sa_orm_load_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_options\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2831\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2833\u001b[0m \u001b[38;5;66;03m# legacy: automatically set scalars, unique\u001b[39;00m\n\u001b[1;32m 2834\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result\u001b[38;5;241m.\u001b[39m_attributes\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_single_entity\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/orm/session.py:2362\u001b[0m, in \u001b[0;36mSession.execute\u001b[0;34m(self, statement, params, execution_options, bind_arguments, _parent_execute_state, _add_event)\u001b[0m\n\u001b[1;32m 2301\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexecute\u001b[39m(\n\u001b[1;32m 2302\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 2303\u001b[0m statement: Executable,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2309\u001b[0m _add_event: Optional[Any] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 2310\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Result[Any]:\n\u001b[1;32m 2311\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Execute a SQL expression construct.\u001b[39;00m\n\u001b[1;32m 2312\u001b[0m \n\u001b[1;32m 2313\u001b[0m \u001b[38;5;124;03m Returns a :class:`_engine.Result` object representing\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2360\u001b[0m \n\u001b[1;32m 2361\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2362\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_internal\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2363\u001b[0m \u001b[43m \u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2364\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2365\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexecution_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2366\u001b[0m \u001b[43m \u001b[49m\u001b[43mbind_arguments\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbind_arguments\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2367\u001b[0m \u001b[43m \u001b[49m\u001b[43m_parent_execute_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_parent_execute_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2368\u001b[0m \u001b[43m \u001b[49m\u001b[43m_add_event\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_add_event\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2369\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/orm/session.py:2247\u001b[0m, in \u001b[0;36mSession._execute_internal\u001b[0;34m(self, statement, params, execution_options, bind_arguments, _parent_execute_state, _add_event, _scalar_result)\u001b[0m\n\u001b[1;32m 2242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m conn\u001b[38;5;241m.\u001b[39mscalar(\n\u001b[1;32m 2243\u001b[0m statement, params \u001b[38;5;129;01mor\u001b[39;00m {}, execution_options\u001b[38;5;241m=\u001b[39mexecution_options\n\u001b[1;32m 2244\u001b[0m )\n\u001b[1;32m 2246\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m compile_state_cls:\n\u001b[0;32m-> 2247\u001b[0m result: Result[Any] \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_state_cls\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43morm_execute_statement\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m \u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2250\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2251\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2252\u001b[0m \u001b[43m \u001b[49m\u001b[43mbind_arguments\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2253\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2254\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2256\u001b[0m result \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mexecute(\n\u001b[1;32m 2257\u001b[0m statement, params \u001b[38;5;129;01mor\u001b[39;00m {}, execution_options\u001b[38;5;241m=\u001b[39mexecution_options\n\u001b[1;32m 2258\u001b[0m )\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/orm/context.py:305\u001b[0m, in \u001b[0;36mAbstractORMCompileState.orm_execute_statement\u001b[0;34m(cls, session, statement, params, execution_options, bind_arguments, conn)\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21morm_execute_statement\u001b[39m(\n\u001b[1;32m 297\u001b[0m \u001b[38;5;28mcls\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 303\u001b[0m conn,\n\u001b[1;32m 304\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Result:\n\u001b[0;32m--> 305\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 306\u001b[0m \u001b[43m \u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexecution_options\u001b[49m\n\u001b[1;32m 307\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39morm_setup_cursor_result(\n\u001b[1;32m 309\u001b[0m session,\n\u001b[1;32m 310\u001b[0m statement,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 314\u001b[0m result,\n\u001b[1;32m 315\u001b[0m )\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1418\u001b[0m, in \u001b[0;36mConnection.execute\u001b[0;34m(self, statement, parameters, execution_options)\u001b[0m\n\u001b[1;32m 1416\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mObjectNotExecutableError(statement) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 1417\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1418\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmeth\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1419\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1420\u001b[0m \u001b[43m \u001b[49m\u001b[43mdistilled_parameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1421\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mNO_OPTIONS\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1422\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/sql/elements.py:515\u001b[0m, in \u001b[0;36mClauseElement._execute_on_connection\u001b[0;34m(self, connection, distilled_params, execution_options)\u001b[0m\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m TYPE_CHECKING:\n\u001b[1;32m 514\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m, Executable)\n\u001b[0;32m--> 515\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_clauseelement\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 516\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdistilled_params\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\n\u001b[1;32m 517\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 519\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mObjectNotExecutableError(\u001b[38;5;28mself\u001b[39m)\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1640\u001b[0m, in \u001b[0;36mConnection._execute_clauseelement\u001b[0;34m(self, elem, distilled_parameters, execution_options)\u001b[0m\n\u001b[1;32m 1628\u001b[0m compiled_cache: Optional[CompiledCacheType] \u001b[38;5;241m=\u001b[39m execution_options\u001b[38;5;241m.\u001b[39mget(\n\u001b[1;32m 1629\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiled_cache\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine\u001b[38;5;241m.\u001b[39m_compiled_cache\n\u001b[1;32m 1630\u001b[0m )\n\u001b[1;32m 1632\u001b[0m compiled_sql, extracted_params, cache_hit \u001b[38;5;241m=\u001b[39m elem\u001b[38;5;241m.\u001b[39m_compile_w_cache(\n\u001b[1;32m 1633\u001b[0m dialect\u001b[38;5;241m=\u001b[39mdialect,\n\u001b[1;32m 1634\u001b[0m compiled_cache\u001b[38;5;241m=\u001b[39mcompiled_cache,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1638\u001b[0m linting\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdialect\u001b[38;5;241m.\u001b[39mcompiler_linting \u001b[38;5;241m|\u001b[39m compiler\u001b[38;5;241m.\u001b[39mWARN_LINTING,\n\u001b[1;32m 1639\u001b[0m )\n\u001b[0;32m-> 1640\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_context\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1641\u001b[0m \u001b[43m \u001b[49m\u001b[43mdialect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1642\u001b[0m \u001b[43m \u001b[49m\u001b[43mdialect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_ctx_cls\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_init_compiled\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1643\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled_sql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1644\u001b[0m \u001b[43m \u001b[49m\u001b[43mdistilled_parameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1645\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecution_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1646\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled_sql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1647\u001b[0m \u001b[43m \u001b[49m\u001b[43mdistilled_parameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1648\u001b[0m \u001b[43m \u001b[49m\u001b[43melem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1649\u001b[0m \u001b[43m \u001b[49m\u001b[43mextracted_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1650\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_hit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_hit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1651\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1652\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_events:\n\u001b[1;32m 1653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdispatch\u001b[38;5;241m.\u001b[39mafter_execute(\n\u001b[1;32m 1654\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1655\u001b[0m elem,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1659\u001b[0m ret,\n\u001b[1;32m 1660\u001b[0m )\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1846\u001b[0m, in \u001b[0;36mConnection._execute_context\u001b[0;34m(self, dialect, constructor, statement, parameters, execution_options, *args, **kw)\u001b[0m\n\u001b[1;32m 1844\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exec_insertmany_context(dialect, context)\n\u001b[1;32m 1845\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_exec_single_context\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1847\u001b[0m \u001b[43m \u001b[49m\u001b[43mdialect\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\n\u001b[1;32m 1848\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1986\u001b[0m, in \u001b[0;36mConnection._exec_single_context\u001b[0;34m(self, dialect, context, statement, parameters)\u001b[0m\n\u001b[1;32m 1983\u001b[0m result \u001b[38;5;241m=\u001b[39m context\u001b[38;5;241m.\u001b[39m_setup_result_proxy()\n\u001b[1;32m 1985\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m-> 1986\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_handle_dbapi_exception\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1987\u001b[0m \u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstr_statement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meffective_parameters\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcursor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\n\u001b[1;32m 1988\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1990\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:2355\u001b[0m, in \u001b[0;36mConnection._handle_dbapi_exception\u001b[0;34m(self, e, statement, parameters, cursor, context, is_sub_exec)\u001b[0m\n\u001b[1;32m 2353\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m should_wrap:\n\u001b[1;32m 2354\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m sqlalchemy_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 2355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m sqlalchemy_exception\u001b[38;5;241m.\u001b[39mwith_traceback(exc_info[\u001b[38;5;241m2\u001b[39m]) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 2356\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2357\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m exc_info[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1967\u001b[0m, in \u001b[0;36mConnection._exec_single_context\u001b[0;34m(self, dialect, context, statement, parameters)\u001b[0m\n\u001b[1;32m 1965\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1966\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m evt_handled:\n\u001b[0;32m-> 1967\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdialect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdo_execute\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1968\u001b[0m \u001b[43m \u001b[49m\u001b[43mcursor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstr_statement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meffective_parameters\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\n\u001b[1;32m 1969\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1971\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_has_events \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine\u001b[38;5;241m.\u001b[39m_has_events:\n\u001b[1;32m 1972\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdispatch\u001b[38;5;241m.\u001b[39mafter_cursor_execute(\n\u001b[1;32m 1973\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1974\u001b[0m cursor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1978\u001b[0m context\u001b[38;5;241m.\u001b[39mexecutemany,\n\u001b[1;32m 1979\u001b[0m )\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/sqlalchemy/engine/default.py:941\u001b[0m, in \u001b[0;36mDefaultDialect.do_execute\u001b[0;34m(self, cursor, statement, parameters, context)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdo_execute\u001b[39m(\u001b[38;5;28mself\u001b[39m, cursor, statement, parameters, context\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 941\u001b[0m \u001b[43mcursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstatement\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/decision-support-tool-9TtSrW0h-py3.12/lib/python3.12/site-packages/psycopg/cursor.py:97\u001b[0m, in \u001b[0;36mCursor.execute\u001b[0;34m(self, query, params, prepare, binary)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_conn\u001b[38;5;241m.\u001b[39mwait(\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_execute_gen(query, params, prepare\u001b[38;5;241m=\u001b[39mprepare, binary\u001b[38;5;241m=\u001b[39mbinary)\n\u001b[1;32m 95\u001b[0m )\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m e\u001b[38;5;241m.\u001b[39m_NO_TRACEBACK \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 97\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ex\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", + "\u001b[0;31mProgrammingError\u001b[0m: (psycopg.errors.UndefinedFunction) operator does not exist: text && character varying\nLINE 3: WHERE document.dataset && $1::VARCHAR\n ^\nHINT: No operator matches the given name and argument types. You might need to add explicit type casts.\n[SQL: SELECT document.name AS document_name, document.content AS document_content, document.source AS document_source, document.dataset AS document_dataset, document.program AS document_program, document.region AS document_region, document.id AS document_id, document.created_at AS document_created_at, document.updated_at AS document_updated_at \nFROM document \nWHERE document.dataset && %(dataset_1)s::VARCHAR]\n[SQL parameters hidden due to hide_parameters=True]\n(Background on this error at: https://sqlalche.me/e/20/f405)" + ] + } + ], "source": [ "with app_config.db_session() as db_session:\n", - " documents = db_session.execute(select(Document)).scalars().all()\n", - " fields = [\"question\", \"answer\", \"document_name\", \"document_source\", \"document_id\", \"chunk_id\"]\n", + " selected_dataset = list(question_dataset.value)\n", + " documents = db_session.query(Document).filter(Document.dataset.in_(selected_dataset)).all()\n", + "\n", + " fields = [\"question\", \"answer\", \"document_name\", \"document_source\", \"dataset\", \"document_id\", \"chunk_id\"]\n", + "\n", " logging.basicConfig(level=logging.INFO, format=\"%(asctime)s - %(levelname)s - %(message)s\")\n", " logging.info(f'Start processing with llm {llm_model.value}')\n", "\n", + " dataset_list = \", \".join(question_dataset.value)\n", " for document in documents:\n", " chunk_list= document.chunks\n", - " if question_gen_selection == \"chunk\":\n", + " if question_gen_selection.value == \"chunk\":\n", " for chunk in chunk_list:\n", - " chunk_q_a_json = process_document_or_chunk(document=chunk, num_of_chunks=num_qa_per_chunk_or_doc.value, llm=llm_model.value)\n", + " chunk_q_a_json = process_document_or_chunk(document=chunk, num_of_chunks=num_qa_per_chunk_or_doc.value, llm=llm_model.value, dataset=dataset_list)\n", " write_question_answer_json_to_csv(\"question_answer_pairs.csv\", fields, chunk_q_a_json) \n", " else:\n", - " document_q_a_json = process_document_or_chunk(document=document, num_of_chunks=num_qa_per_chunk_or_doc.value,llm=llm_model.value)\n", + " document_q_a_json = process_document_or_chunk(document=document, num_of_chunks=num_qa_per_chunk_or_doc.value,llm=llm_model.value, dataset=dataset_list)\n", " write_question_answer_json_to_csv(\"question_answer_pairs.csv\", fields, document_q_a_json) \n", " logger.info(\"Finished processing\")" ]