From 8893bdd83bb78be77cb528d4ff7be82c53f7b4ab Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Fri, 5 Jan 2024 15:13:49 -0500 Subject: [PATCH] remove bad examples --- docs/examples/autodataframe.md | 138 ------------------ docs/examples/index.md | 13 +- .../auto_dataframe.py | 77 ---------- .../auto_multi_dataframe.py | 97 ------------ examples/extract-table/run_text.py | 94 ++++++++++++ .../extract-table/{run.py => run_vision.py} | 69 ++++++++- mkdocs.yml | 5 +- 7 files changed, 168 insertions(+), 325 deletions(-) delete mode 100644 docs/examples/autodataframe.md delete mode 100644 examples/automatic_dataframe_extraction/auto_dataframe.py delete mode 100644 examples/automatic_dataframe_extraction/auto_multi_dataframe.py create mode 100644 examples/extract-table/run_text.py rename examples/extract-table/{run.py => run_vision.py} (50%) diff --git a/docs/examples/autodataframe.md b/docs/examples/autodataframe.md deleted file mode 100644 index 2607c7573..000000000 --- a/docs/examples/autodataframe.md +++ /dev/null @@ -1,138 +0,0 @@ -# Example: Converting Text into Dataframes - -In this example, we'll demonstrate how to convert a text into dataframes using OpenAI Function Call. We will define the necessary data structures using Pydantic and show how to convert the text into dataframes. - -!!! note "Motivation" - - Often times when we parse data we have an opportunity to extract structured data, what if we could extract an arbitrary number of tables with arbitrary schemas? By pulling out dataframes we could write tables or .csv files and attach them to our retrieved data. - -## Defining the Data Structures - -Let's start by defining the data structures required for this task: `RowData`, `Dataframe`, and `Database`. - -```python -from pydantic import Field, BaseModel -from typing import List, Any - - -class RowData(BaseModel): - row: List[Any] = Field(..., description="The values for each row") - citation: str = Field( - ..., description="The citation for this row from the original source data" - ) - - -class Dataframe(BaseModel): - """ - Class representing a dataframe. This class is used to convert - data into a frame that can be used by pandas. - """ - - name: str = Field(..., description="The name of the dataframe") - data: List[RowData] = Field( - ..., - description="Correct rows of data aligned to column names, Nones are allowed", - ) - columns: List[str] = Field( - ..., - description="Column names relevant from source data, should be in snake_case", - ) - - def to_pandas(self): - import pandas as pd - - columns = self.columns + ["citation"] - data = [row.row + [row.citation] for row in self.data] - - return pd.DataFrame(data=data, columns=columns) - - -class Database(BaseModel): - """ - A set of correct named and defined tables as dataframes - """ - - tables: List[Dataframe] = Field( - ..., - description="List of tables in the database", - ) -``` - -The `RowData` class represents a single row of data in the dataframe. It contains a `row` attribute for the values in each row and a `citation` attribute for the citation from the original source data. - -The `Dataframe` class represents a dataframe and consists of a `name` attribute, a list of `RowData` objects in the `data` attribute, and a list of column names in the `columns` attribute. It also provides a `to_pandas` method to convert the dataframe into a Pandas DataFrame. - -The `Database` class represents a set of tables in a database. It contains a list of `Dataframe` objects in the `tables` attribute. - -## Using the Prompt Pipeline - -To convert a text into dataframes, we'll use the Prompt Pipeline in OpenAI Function Call. We can define a function `dataframe` that takes a text as input and returns a `Database` object. - -```python -import instructor -from openai import OpenAI - -# Apply the patch to the OpenAI client -# enables response_model keyword -client = instructor.patch(OpenAI()) - - -def dataframe(data: str) -> Database: - return client.chat.completions.create( - model="gpt-4-0613", - temperature=0.1, - response_model=Database, - messages=[ - { - "role": "system", - "content": """Map this data into a dataframe a - nd correctly define the correct columns and rows""", - }, - { - "role": "user", - "content": f"{data}", - }, - ], - max_tokens=1000, - ) -``` - -The `dataframe` function takes a string `data` as input and creates a completion using the Prompt Pipeline. It prompts the model to map the data into a dataframe and define the correct columns and rows. The resulting completion is then converted into a `Database` object. - -## Evaluating an Example - -Let's evaluate the example by converting a text into dataframes using the `dataframe` function and print the resulting dataframes. - -```python -dfs = dataframe("""My name is John and I am 25 years old. I live in -New York and I like to play basketball. His name is -Mike and he is 30 years old. He lives in San Francisco -and he likes to play baseball. Sarah is 20 years old -and she lives in Los Angeles. She likes to play tennis. -Her name is Mary and she is 35 years old. -She lives in Chicago. - -On one team 'Tigers' the captain is John and there are 12 players. -On the other team 'Lions' the captain is Mike and there are 10 players. -""") - -for df in dfs.tables: - print(df.name) - print(df.to_pandas()) -``` - -The output will be: - -```sh -People -Name Age City Favorite Sport -0 John 25 New York Basketball -1 Mike 30 San Francisco Baseball -2 Sarah 20 Los Angeles Tennis -3 Mary 35 Chicago None - -Teams -Team Name Captain Number of Players -0 Tigers John 12 -1 Lions Mike 10 -``` diff --git a/docs/examples/index.md b/docs/examples/index.md index 9cff0f0f2..23631a75a 100644 --- a/docs/examples/index.md +++ b/docs/examples/index.md @@ -10,12 +10,11 @@ 6. [How are complex queries decomposed into subqueries in a single request?](planning-tasks.md) 7. [How are entities extracted and resolved from documents?](entity_resolution.md) 8. [How are recursive schemas implemented and understood?](recursive.md) -9. [How are tables extracted automatically from textual data?](autodataframe.md) -10. [How is multi-file code generation accomplished?](gpt-engineer.md) -11. [How is Personally Identifiable Information sanitized from documents?](pii.md) -12. [How are action items and dependencies generated from transcripts?](action_items.md) -13. [How to enable OpenAI's moderation](moderation.md) -14. [How to extract tables from images](extracting_tables.md) -15. [How to generate advertising copy from image inputs](image_to_ad_copy.md) +9. [How is multi-file code generation accomplished?](gpt-engineer.md) +10. [How is Personally Identifiable Information sanitized from documents?](pii.md) +11. [How are action items and dependencies generated from transcripts?](action_items.md) +12. [How to enable OpenAI's moderation](moderation.md) +13. [How to extract tables using GPT-Vision?](extracting_tables.md) +14. [How to generate advertising copy from image inputs](image_to_ad_copy.md) Explore more! diff --git a/examples/automatic_dataframe_extraction/auto_dataframe.py b/examples/automatic_dataframe_extraction/auto_dataframe.py deleted file mode 100644 index 971dcf58a..000000000 --- a/examples/automatic_dataframe_extraction/auto_dataframe.py +++ /dev/null @@ -1,77 +0,0 @@ -from instructor import OpenAISchema -from pydantic import Field -from typing import List, Any -from openai import OpenAI - -client = OpenAI() - - -class RowData(OpenAISchema): - row: List[Any] = Field(..., description="The values for each row") - - -class Dataframe(OpenAISchema): - """ - Class representing a dataframe. This class is used to convert - data into a frame that can be used by pandas. - """ - - data: List[RowData] = Field( - ..., - description="Correct rows of data aligned to column names, Nones are allowed", - ) - columns: List[str] = Field( - ..., - description="Column names relevant from source data, should be in snake_case", - ) - - def to_pandas(self): - import pandas as pd - - columns = self.columns - data = [row.row for row in self.data] - - return pd.DataFrame(data=data, columns=columns) - - -def dataframe(data: str) -> Dataframe: - completion = client.chat.completions.create( - model="gpt-3.5-turbo-0613", - temperature=0.1, - functions=[Dataframe.openai_schema], - function_call={"name": Dataframe.openai_schema["name"]}, - messages=[ - { - "role": "system", - "content": """Map this data into a dataframe a - nd correctly define the correct columns and rows""", - }, - { - "role": "user", - "content": f"{data}", - }, - ], - max_tokens=1000, - ) - return Dataframe.from_response(completion) - - -if __name__ == "__main__": - df = dataframe( - """My name is John and I am 25 years old. I live in - New York and I like to play basketball. His name is - Mike and he is 30 years old. He lives in San Francisco - and he likes to play baseball. Sarah is 20 years old - and she lives in Los Angeles. She likes to play tennis. - Her name is Mary and she is 35 years old. - She lives in Chicago.""" - ) - - print(df.to_pandas()) - """ - name age location hobby - 0 John 25 New York basketball - 1 Mike 30 San Francisco baseball - 2 Sarah 20 Los Angeles tennis - 3 Mary 35 Chicago None - """ diff --git a/examples/automatic_dataframe_extraction/auto_multi_dataframe.py b/examples/automatic_dataframe_extraction/auto_multi_dataframe.py deleted file mode 100644 index 57a43cdfc..000000000 --- a/examples/automatic_dataframe_extraction/auto_multi_dataframe.py +++ /dev/null @@ -1,97 +0,0 @@ -from instructor import OpenAISchema -from pydantic import Field -from typing import List, Any -from openai import OpenAI - -client = OpenAI() - - -class RowData(OpenAISchema): - row: List[Any] = Field(..., description="Correct values for each row") - - -class Dataframe(OpenAISchema): - name: str = Field(..., description="The name of the dataframe") - data: List[RowData] = Field( - ..., - description="Correct rows of data aligned to column names, Nones are allowed", - ) - columns: List[str] = Field( - ..., - description="Column names relevant from source data, should be in snake_case", - ) - - def to_pandas(self): - import pandas as pd - - columns = self.columns - data = [row.row for row in self.data] - - return pd.DataFrame(data=data, columns=columns) - - -class Database(OpenAISchema): - """ - A set of correct named and defined tables as dataframes - Each one should have the right number of columns and correct - values for each. - """ - - tables: List[Dataframe] = Field( - ..., - description="List of tables in the database", - ) - - -def dataframe(data: str) -> Database: - completion = client.chat.completions.create( - model="gpt-4-0613", - temperature=0.0, - functions=[Database.openai_schema], - function_call={"name": Database.openai_schema["name"]}, - messages=[ - { - "role": "system", - "content": """Map this data into a dataframe a - nd correctly define the correct columns and rows""", - }, - { - "role": "user", - "content": f"{data}", - }, - ], - max_tokens=1000, - ) - return Database.from_response(completion) - - -if __name__ == "__main__": - dfs = dataframe( - """My name is John and I am 25 years old. I live in - New York and I like to play basketball. His name is - Mike and he is 30 years old. He lives in San Francisco - and he likes to play baseball. Sarah is 20 years old - and she lives in Los Angeles. She likes to play tennis. - Her name is Mary and she is 35 years old. - She lives in Chicago. - - On one team 'Tigers' the captan is John and there are 12 players. - On the other team 'Lions' the captan is Mike and there are 10 players. - """ - ) - - for df in dfs.tables: - print(df.name) - print(df.to_pandas()) - """ - People - ID Name Age City Favorite Sport - 0 1 John 25 New York Basketball - 1 2 Mike 30 San Francisco Baseball - 2 3 Sarah 20 Los Angeles Tennis - 3 4 Mary 35 Chicago None - Teams - ID Team Name Captain Number of Players - 0 1 Tigers John 12 - 1 2 Lions Mike 10 - """ diff --git a/examples/extract-table/run_text.py b/examples/extract-table/run_text.py new file mode 100644 index 000000000..21848ea81 --- /dev/null +++ b/examples/extract-table/run_text.py @@ -0,0 +1,94 @@ +from openai import OpenAI +from io import StringIO +from typing import Annotated, Any, Iterable +from openai import OpenAI +from pydantic import ( + BaseModel, + BeforeValidator, + PlainSerializer, + InstanceOf, + WithJsonSchema, +) +import pandas as pd +from tomlkit import table +import instructor + + +client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON) + + +def md_to_df(data: Any) -> Any: + if isinstance(data, str): + return ( + pd.read_csv( + StringIO(data), # Get rid of whitespaces + sep="|", + index_col=1, + ) + .dropna(axis=1, how="all") + .iloc[1:] + .map(lambda x: x.strip()) + ) + return data + + +MarkdownDataFrame = Annotated[ + InstanceOf[pd.DataFrame], + BeforeValidator(md_to_df), + PlainSerializer(lambda x: x.to_markdown()), + WithJsonSchema( + { + "type": "string", + "description": """ + The markdown representation of the table, + each one should be tidy, do not try to join tables + that should be seperate""", + } + ), +] + + +class Table(BaseModel): + caption: str + dataframe: MarkdownDataFrame + + +client = instructor.patch(OpenAI()) + + +tables = client.chat.completions.create( + model="gpt-3.5-turbo", + response_model=Iterable[Table], + messages=[ + { + "role": "system", + "content": "Please extract the tables from the following text, merge as much as possible:", + }, + { + "role": "user", + "content": """ + My name is John and I am 25 years old. I live in + New York and I like to play basketball. His name is + Mike and he is 30 years old. He lives in San Francisco + and he likes to play baseball. Sarah is 20 years old + and she lives in Los Angeles. She likes to play tennis. + Her name is Mary and she is 35 years old. + She lives in Chicago. + """, + }, + ], +) + +for table in tables: + print(table.caption) + print(table.dataframe) + print() + """ + People + Age City Hobby + Name + John 25 New York Basketball + Mike 30 San Francisco Baseball + Sarah 20 Los Angeles Tennis + Mary 35 Chicago N/A + """ diff --git a/examples/extract-table/run.py b/examples/extract-table/run_vision.py similarity index 50% rename from examples/extract-table/run.py rename to examples/extract-table/run_vision.py index 740db9f7c..96b9ba9d1 100644 --- a/examples/extract-table/run.py +++ b/examples/extract-table/run_vision.py @@ -1,11 +1,61 @@ from openai import OpenAI +from io import StringIO +from typing import Annotated, Any, Iterable +from openai import OpenAI +from pydantic import ( + BaseModel, + BeforeValidator, + PlainSerializer, + InstanceOf, + WithJsonSchema, +) +import pandas as pd +import instructor + + +client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON) + + +def md_to_df(data: Any) -> Any: + if isinstance(data, str): + return ( + pd.read_csv( + StringIO(data), # Get rid of whitespaces + sep="|", + index_col=1, + ) + .dropna(axis=1, how="all") + .iloc[1:] + .map(lambda x: x.strip()) + ) + return data + + +MarkdownDataFrame = Annotated[ + InstanceOf[pd.DataFrame], + BeforeValidator(md_to_df), + PlainSerializer(lambda x: x.to_markdown()), + WithJsonSchema( + { + "type": "string", + "description": """ + The markdown representation of the table, + each one should be tidy, do not try to join tables + that should be seperate""", + } + ), +] + -client = OpenAI() +class Table(BaseModel): + caption: str + dataframe: MarkdownDataFrame -response = client.chat.completions.create( +tables = client.chat.completions.create( model="gpt-4-vision-preview", max_tokens=1000, + response_model=Iterable[Table], messages=[ { "role": "user", @@ -38,4 +88,17 @@ ], ) -print(response.choices[0].message.content) +for table in tables: + print(table.caption) + print(table.dataframe) + print() + """ + D1 App Retention Rates July 2023 (Ireland & U.K.) + Ireland UK + Category + Education 14% 12% + Entertainment 13% 11% + Games 26% 25% + Social 27% 18% + Utilities 11% 9% + """ diff --git a/mkdocs.yml b/mkdocs.yml index 397ec5a79..20287a07e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -149,8 +149,7 @@ nav: - Overview: 'examples/index.md' - Text Classification: 'examples/classification.md' - Self Critique: 'examples/self_critique.md' - - Image Extracting Tables: 'examples/extracting_tables.md' - - Image to Ad Copy: 'examples/image_to_ad_copy.md' + - Extracting Tables: 'examples/extracting_tables.md' - Moderation: 'examples/moderation.md' - Citations: 'examples/exact_citations.md' - Knowledge Graph: 'examples/knowledge_graph.md' @@ -158,11 +157,11 @@ nav: - Search Queries: 'examples/search.md' - Query Decomposition: 'examples/planning-tasks.md' - Recursive Schemas: 'examples/recursive.md' - - Table Extraction: 'examples/autodataframe.md' - Action Item and Dependency Mapping: 'examples/action_items.md' - Multi-File Code Generation: 'examples/gpt-engineer.md' - PII Data Sanitization: 'examples/pii.md' - Open Source: 'examples/open_source.md' + - Image to Ad Copy: 'examples/image_to_ad_copy.md' - CLI Reference: - "Introduction": "cli/index.md" - "Finetuning GPT-3.5": "cli/finetune.md"