From e98f777ef5d2a5d205ce6c02bc81232139397f34 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Sat, 14 Oct 2023 15:13:05 -0400 Subject: [PATCH] clean up distil example --- examples/distilations/math_finetunes.jsonl | 10 +++ examples/distilations/three_digit_mul.py | 79 ++++++++++++++++++++++ examples/distilations/user_code_global.py | 70 ------------------- instructor/distil.py | 42 +++++++----- 4 files changed, 115 insertions(+), 86 deletions(-) create mode 100644 examples/distilations/math_finetunes.jsonl create mode 100644 examples/distilations/three_digit_mul.py delete mode 100644 examples/distilations/user_code_global.py diff --git a/examples/distilations/math_finetunes.jsonl b/examples/distilations/math_finetunes.jsonl new file mode 100644 index 000000000..7bb6a2b93 --- /dev/null +++ b/examples/distilations/math_finetunes.jsonl @@ -0,0 +1,10 @@ +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(396, b=414, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":396,\"b\":414,\"result\":810}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(812, b=108, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":812,\"b\":108,\"result\":920}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(923, b=561, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":923,\"b\":561,\"result\":1484}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(933, b=756, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":933,\"b\":756,\"result\":1689}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(807, b=389, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":807,\"b\":389,\"result\":1196}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(509, b=776, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":509,\"b\":776,\"result\":1285}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(165, b=442, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":165,\"b\":442,\"result\":607}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(843, b=304, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":843,\"b\":304,\"result\":1147}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(512, b=791, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":512,\"b\":791,\"result\":1303}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} +{"messages": [{"role": "system", "content": "Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n\"\"\"\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n\"\"\""}, {"role": "user", "content": "Return fn(133, b=539, c=\"hello\")"}, {"role": "assistant", "function_call": {"name": "Response", "arguments": "{\"a\":133,\"b\":539,\"result\":672}"}}], "functions": [{"name": "Response", "description": "Correctly extracted `Response` with all the required parameters with correct types", "parameters": {"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}, "result": {"type": "integer"}}, "required": ["a", "b", "result"], "type": "object"}}]} \ No newline at end of file diff --git a/examples/distilations/three_digit_mul.py b/examples/distilations/three_digit_mul.py new file mode 100644 index 000000000..849f3b196 --- /dev/null +++ b/examples/distilations/three_digit_mul.py @@ -0,0 +1,79 @@ +import logging + +from pydantic import BaseModel +from instructor.distil import Instructions + +logging.basicConfig(level=logging.INFO) + +# Usage +instructions = Instructions( + name="three_digit_multiply", + finetune_format="messages", + log_handlers=[ + logging.FileHandler("math_finetunes.jsonl"), + ], +) + + +class Response(BaseModel): + a: int + b: int + result: int + + +@instructions.distil +def fn(a: int, b: int, c: str) -> Response: + """_summary_ + + Args: + a (int): _description_ + b (int): _description_ + c (str): _description_ + + Returns: + Response: _description_ + """ + resp = a + b + return Response(a=a, b=b, result=resp) + + +if __name__ == "__main__": + import random + + # A log will look like this: + log_line = { + "messages": [ + { + "role": "system", + "content": 'Predict the results of this function:\n\ndef fn(a: int, b: int, c: str) -> __main__.Response\n"""\n_summary_\n\nArgs:\n a (int): _description_\n b (int): _description_\n c (str): _description_\n\nReturns:\n Response: _description_\n"""', + }, + {"role": "user", "content": 'Return fn(133, b=539, c="hello")'}, + { + "role": "assistant", + "function_call": { + "name": "Response", + "arguments": '{"a":133,"b":539,"result":672}', + }, + }, + ], + "functions": [ + { + "name": "Response", + "description": "Correctly extracted `Response` with all the required parameters with correct types", + "parameters": { + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + "result": {"type": "integer"}, + }, + "required": ["a", "b", "result"], + "type": "object", + }, + } + ], + } + + for _ in range(10): + a = random.randint(100, 999) + b = random.randint(100, 999) + print("returning", fn(a, b=b, c="hello")) diff --git a/examples/distilations/user_code_global.py b/examples/distilations/user_code_global.py deleted file mode 100644 index 0b433bb1f..000000000 --- a/examples/distilations/user_code_global.py +++ /dev/null @@ -1,70 +0,0 @@ -import datetime -import json -import os -import uuid -import requests -import logging - -from pydantic import BaseModel - -from instructor.distil import Instructions - - -class DatasetHandler(logging.Handler): - def __init__(self, dataset_name=None): - super().__init__() - self.url = os.environ.get("INSTRUCTOR_URL") - self.api_key = os.environ.get("INSTRUCTOR_KEY") - self.formatter = logging.Formatter("%(message)s") - self.dataset_name = dataset_name - self.uuid = str(uuid.uuid4()) - - def emit(self, record: logging.LogRecord) -> None: - log_entry = json.loads(self.format(record)) - - new_entry = { - "record": log_entry, - "dataset_name": self.dataset_name, - "batch_id": self.uuid, - "created_at": datetime.datetime.now().isoformat(), - } - - try: - response = requests.post(self.url, data=json.dumps(new_entry)) - if response.status_code != 200: - self.handleError(record) - except Exception: - self.handleError(record) - - -logging.basicConfig(level=logging.INFO) - -# Usage -instructions = Instructions( - name="test_distil", - log_handlers=[ - logging.FileHandler("finetunes.jsonl"), - DatasetHandler("finetunes_fo_test_distil"), - ], -) - - -class Response(BaseModel): - a: int - b: int - result: int - - -@instructions.distil -def fn(a: int, b: int) -> Response: - resp = a + b - return Response(a=a, b=b, result=resp) - - -if __name__ == "__main__": - import random - - for _ in range(10): - a = random.randint(100, 999) - b = random.randint(100, 999) - print("returning", fn(a, b)) diff --git a/instructor/distil.py b/instructor/distil.py index 7b9c4bed7..d17f35913 100644 --- a/instructor/distil.py +++ b/instructor/distil.py @@ -3,7 +3,6 @@ import inspect import json import logging -import os from typing import Any, Callable, List, Optional import uuid @@ -79,10 +78,12 @@ def __init__( name: str = None, id: str = None, log_handlers: List[logging.Handler] = None, + finetune_format: FinetuneFormat = FinetuneFormat.MESSAGES, ): self.name = name self.id = id or str(uuid.uuid4()) self.unique_id = str(uuid.uuid4()) + self.finetune_format = finetune_format self.logger = logging.getLogger(self.name) for handler in log_handlers or []: @@ -93,7 +94,7 @@ def distil( *args, name: str = None, mode: str = "distil", - fine_tune_format: FinetuneFormat = FinetuneFormat.MESSAGES, + fine_tune_format: FinetuneFormat = None, ): """ Decorator to track the function call and response, supports distillation and dispatch modes. @@ -118,6 +119,9 @@ def distil( assert mode in allowed_modes, f"Must be in {allowed_modes}" assert mode == "distil", "Only distil mode is supported at the moment." + if fine_tune_format is None: + fine_tune_format = self.finetune_format + def _wrap_distil(fn): msg = f"Return type hint for {fn} must subclass `pydantic.BaseModel'" assert is_return_type_base_model_or_instance(fn), msg @@ -161,30 +165,25 @@ def track( name = name if name else fn.__name__ base_model: BaseModel = type(resp) - if finetune_format == FinetuneFormat.RAW: - function_body = dict( - fn_name=name, - fn_repr=format_function(fn), - args=args, - kwargs=kwargs, - resp=resp.model_dump(), - schema=base_model.model_json_schema(), - ) - self.logger.info(json.dumps(function_body)) - if finetune_format == FinetuneFormat.MESSAGES: - # This is the format that OpenAI's API expects for a finetune call openai_function_call = openai_schema(base_model).openai_schema function_definition = get_signature_from_fn(fn).replace(fn.__name__, name) + + str_args = ", ".join(map(str, args)) + str_kwargs = ( + ", ".join(f"{k}={json.dumps(v)}" for k, v in kwargs.items()) or None + ) + call_args = ", ".join(filter(None, [str_args, str_kwargs])) + function_body = { "messages": [ { "role": "system", - "content": f"Return the response from the function call.\n\n {function_definition}", + "content": f"Predict the results of this function:\n\n{function_definition}", }, { "role": "user", - "content": f"Return the results of the function with the following arguments:\n\n {name}(*{args}, **{kwargs})", + "content": f"Return {name}({call_args})", }, { "role": "assistant", @@ -197,3 +196,14 @@ def track( "functions": [openai_function_call], } self.logger.info(json.dumps(function_body)) + + if finetune_format == FinetuneFormat.RAW: + function_body = dict( + fn_name=name, + fn_repr=format_function(fn), + args=args, + kwargs=kwargs, + resp=resp.model_dump(), + schema=base_model.model_json_schema(), + ) + self.logger.info(json.dumps(function_body))