From f04a2f7632a6244b0b36670238b0b6be3801227a Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 08:42:55 -0600 Subject: [PATCH 01/12] Updated README and removed unused code. Co-authored-by: MindFlow --- README.md | 4 ++-- mindflow/core/index.py | 1 - mindflow/resolving/resolvers/base_resolver.py | 7 ------- mindflow/utils/command_parse.py | 4 +++- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c15d1c9..3d8ee2a 100644 --- a/README.md +++ b/README.md @@ -78,8 +78,8 @@ Make some changes to your git repo and stage them. Then, run `mf commit`! You sh 1 file changed, 14 insertions(+) ``` -### Create PRs With GPT Titles And Body -Make some changes to your branch and stage, and then commit them. Then, run `mf pr`! A PR should be created with a title and body generated by GPT, and a link to the PR should be printed to the console. +### Create PRs/MRs With GPT Titles And Body +Make some changes to your branch and stage, and then commit them. Then, run `mf pr` for GitHub or `mf mr` for GitLab! A pull request/merge request should be created with a title and body generated by GPT, and a link to the PR should be printed to the console. - To use this feature, you must first install and authenticate the [GitHub CLI](https://cli.github.com/). ## How does it work? diff --git a/mindflow/core/index.py b/mindflow/core/index.py index d354ab2..acfdc6d 100644 --- a/mindflow/core/index.py +++ b/mindflow/core/index.py @@ -1,7 +1,6 @@ """ `generate` command """ -from asyncio import Future from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from typing import List diff --git a/mindflow/resolving/resolvers/base_resolver.py b/mindflow/resolving/resolvers/base_resolver.py index 371557f..0de7f0b 100644 --- a/mindflow/resolving/resolvers/base_resolver.py +++ b/mindflow/resolving/resolvers/base_resolver.py @@ -2,7 +2,6 @@ Base Resolver Class """ from typing import List -from typing import Optional from mindflow.db.objects.document import DocumentReference @@ -12,12 +11,6 @@ class BaseResolver: Base class for resolvers """ - @staticmethod - def read_document(document_path: str) -> Optional[str]: - """ - Read a document. - """ - @staticmethod def should_resolve(document_path: str) -> bool: """ diff --git a/mindflow/utils/command_parse.py b/mindflow/utils/command_parse.py index 710842d..f9aa45e 100644 --- a/mindflow/utils/command_parse.py +++ b/mindflow/utils/command_parse.py @@ -1,6 +1,5 @@ from typing import List, Tuple, Optional - def get_flag_value(args: Tuple[str], flag: List[str]) -> Optional[str]: """ Gets the value of a flag in a list of arguments. @@ -15,6 +14,9 @@ def get_flag_value(args: Tuple[str], flag: List[str]) -> Optional[str]: def get_flag_bool(args: Tuple[str], flag: str) -> bool: + """ + Returns True if the flag is in the list of arguments. + """ try: return args.index(flag) >= 0 except: From e985b4b64f8cff7c24511d523c000f1c1f7ba094 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 08:48:14 -0600 Subject: [PATCH 02/12] Add blank line at beginning of command_parse.py. 
Co-authored-by: MindFlow --- mindflow/utils/command_parse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mindflow/utils/command_parse.py b/mindflow/utils/command_parse.py index f9aa45e..e72b34c 100644 --- a/mindflow/utils/command_parse.py +++ b/mindflow/utils/command_parse.py @@ -1,5 +1,6 @@ from typing import List, Tuple, Optional + def get_flag_value(args: Tuple[str], flag: List[str]) -> Optional[str]: """ Gets the value of a flag in a list of arguments. From 51cabc8bc4af060e488b0c76925043665114f053 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 10:19:24 -0600 Subject: [PATCH 03/12] Added and removed import statements, added test file, and updated functions for token count. Co-authored-by: MindFlow --- mindflow/core/chat.py | 12 +++++++-- mindflow/core/index.py | 18 ++++--------- mindflow/core/query.py | 26 ++++++++++++------- mindflow/db/db/static.py | 2 -- mindflow/db/objects/model.py | 6 +++++ .../static_definition/mind_flow_model.py | 3 --- .../db/objects/static_definition/model.py | 3 --- .../objects/static_definition/model_type.py | 2 -- .../db/objects/static_definition/object.py | 3 --- .../db/objects/static_definition/service.py | 3 --- mindflow/test.py | 7 +++++ mindflow/utils/prompts.py | 1 + mindflow/utils/token.py | 15 +++++++++++ 13 files changed, 61 insertions(+), 40 deletions(-) create mode 100644 mindflow/test.py create mode 100644 mindflow/utils/token.py diff --git a/mindflow/core/chat.py b/mindflow/core/chat.py index c37dcee..d44cd0c 100644 --- a/mindflow/core/chat.py +++ b/mindflow/core/chat.py @@ -1,4 +1,6 @@ from mindflow.settings import Settings +from mindflow.utils.prompts import CHAT_PROMPT_PREFIX +from mindflow.utils.token import get_token_count def run_chat(prompt: str) -> str: @@ -6,12 +8,18 @@ def run_chat(prompt: str) -> str: This function is used to generate a prompt and then use it as a prompt for GPT bot. """ settings = Settings() + completion_model = settings.mindflow_models.query.model + + if get_token_count(completion_model, CHAT_PROMPT_PREFIX+prompt) > completion_model.hard_token_limit: + print("The prompt is too long. Please try again with a shorter prompt.") + return "" + # Prompt GPT through Mindflow API or locally - response: str = settings.mindflow_models.query.model( + response: str = completion_model( [ { "role": "system", - "content": "You are a helpful virtual assistant responding to a users query using your general knowledge and the text provided below.", + "content": CHAT_PROMPT_PREFIX, }, {"role": "user", "content": prompt}, ] diff --git a/mindflow/core/index.py b/mindflow/core/index.py index acfdc6d..a642153 100644 --- a/mindflow/core/index.py +++ b/mindflow/core/index.py @@ -19,6 +19,7 @@ from mindflow.settings import Settings from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import INDEX_PROMPT_PREFIX +from mindflow.utils.token import get_batch_token_count, get_token_count def run_index(document_paths: List[str], refresh: bool, force: bool) -> None: @@ -135,15 +136,6 @@ def iterative_to_dict(self) -> dict: return node_dict -def count_tokens(text: str) -> int: - """ - Counts/estimates the number of tokens this text will consume by GPT. 
- """ - # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - # count = len(tokenizer(text)['input_ids']) - return len(text) // 4 # Token Estimation for speed - - # This function is used to split a string into chunks of a specified token limit using binary search def binary_split_raw_text_to_nodes( completion_model: ConfiguredModel, text: str @@ -155,7 +147,7 @@ def binary_split_raw_text_to_nodes( stack = [(0, len(text))] while stack: start, end = stack.pop() - if count_tokens(text[start:end]) < completion_model.soft_token_limit: + if get_token_count(completion_model, text[start:end]) < completion_model.soft_token_limit: nodes.append(Node(completion_model, start, end, text[start:end])) else: mid = ((end - start) // 2) + start @@ -175,7 +167,7 @@ def binary_split_nodes_to_chunks( while stack: nodes, start, end = stack.pop() if ( - sum(count_tokens(node.summary) for node in nodes[start:end]) + get_batch_token_count(completion_model, [node.summary for node in nodes[start:end]]) < completion_model.soft_token_limit ): chunks.append(nodes[start:end]) @@ -194,7 +186,7 @@ def create_nodes(completion_model: ConfiguredModel, leaf_nodes: List[Node]) -> N while stack: leaf_nodes, start, end = stack.pop() if ( - sum(count_tokens(leaf_node.summary) for leaf_node in leaf_nodes[start:end]) + get_batch_token_count(completion_model, [leaf_node.summary for leaf_node in leaf_nodes[start:end]]) > completion_model.soft_token_limit ): node_chunks: List[List[Node]] = binary_split_nodes_to_chunks( @@ -221,7 +213,7 @@ def create_text_search_tree(completion_model: ConfiguredModel, text: str) -> dic """ This function is used to create a tree of responses from the OpenAI API """ - if count_tokens(text) < completion_model.soft_token_limit: + if get_token_count(completion_model, text) < completion_model.soft_token_limit: return Node(completion_model, 0, len(text), text).to_dict() return create_nodes( diff --git a/mindflow/core/query.py b/mindflow/core/query.py index d5075a4..8afa11e 100644 --- a/mindflow/core/query.py +++ b/mindflow/core/query.py @@ -17,6 +17,7 @@ from mindflow.db.objects.model import ConfiguredModel from mindflow.resolving.resolve import resolve_all from mindflow.settings import Settings +from mindflow.utils.token import get_token_count def run_query(document_paths: List[str], query: str): @@ -33,7 +34,7 @@ def run_query(document_paths: List[str], query: str): select_content( query, document_references, - completion_model.hard_token_limit, + completion_model, embedding_model, ), ) @@ -58,7 +59,7 @@ def build_query_messages(query: str, content: str) -> List[Dict]: def select_content( query: str, document_references: List[DocumentReference], - token_limit: int, + completion_model: ConfiguredModel, embedding_model: ConfiguredModel, ) -> str: """ @@ -73,7 +74,7 @@ def select_content( ) sys.exit(1) - selected_content = trim_content(embedding_ranked_document_chunks, token_limit) + selected_content = trim_content(embedding_ranked_document_chunks, completion_model) return selected_content @@ -129,22 +130,29 @@ def from_search_tree( return chunks, embeddings -def trim_content(ranked_document_chunks: List[DocumentChunk], token_limit: int) -> str: +def trim_content(ranked_document_chunks: List[DocumentChunk], model: ConfiguredModel) -> str: """ This function is used to select the most relevant content for the prompt. 
""" selected_content: str = "" - char_limit: int = token_limit * 3 for document_chunk in ranked_document_chunks: if document_chunk: with open(document_chunk.path, "r", encoding="utf-8") as file: file.seek(document_chunk.start) text = file.read(document_chunk.end - document_chunk.start) - if len(selected_content + text) > char_limit: - selected_content += text[: char_limit - len(selected_content)] - break - selected_content += text + + # Perform a binary search to find the maximum amount of text that fits within the token limit + left, right = 0, len(text) + while left <= right: + mid = (left + right) // 2 + if get_token_count(model, selected_content + text[:mid]) <= model.hard_token_limit: + left = mid + 1 + else: + right = mid - 1 + + # Add the selected text to the selected content + selected_content += text[:right] return selected_content diff --git a/mindflow/db/db/static.py b/mindflow/db/db/static.py index c32fae3..d1d409a 100644 --- a/mindflow/db/db/static.py +++ b/mindflow/db/db/static.py @@ -1,5 +1,3 @@ -from typing import Union - from mindflow.db.db.database import Collection from mindflow.db.db.database import Database from mindflow.db.objects.static_definition.mind_flow_model import MINDFLOW_MODEL_STATIC diff --git a/mindflow/db/objects/model.py b/mindflow/db/objects/model.py index ecc7827..803e623 100644 --- a/mindflow/db/objects/model.py +++ b/mindflow/db/objects/model.py @@ -3,6 +3,8 @@ import openai from traitlets import Callable +import tiktoken + from mindflow.db.db.database import Collection from mindflow.db.objects.base import BaseObject from mindflow.db.objects.base import StaticObject @@ -43,6 +45,7 @@ class ConfiguredModel(Callable): name: str service: str model_type: str + tokenizer: tiktoken.Encoding hard_token_limit: int token_cost: int token_cost_unit: str @@ -64,6 +67,9 @@ def __init__(self, model_id: str): if value not in [None, ""]: setattr(self, key, value) + if self.service == ServiceID.OPENAI.value: + self.tokenizer = tiktoken.encoding_for_model(self.id) + service_config = ServiceConfig.load(f"{self.service}_config") self.api_key = service_config.api_key diff --git a/mindflow/db/objects/static_definition/mind_flow_model.py b/mindflow/db/objects/static_definition/mind_flow_model.py index 94b904c..a4bfd1e 100644 --- a/mindflow/db/objects/static_definition/mind_flow_model.py +++ b/mindflow/db/objects/static_definition/mind_flow_model.py @@ -1,6 +1,3 @@ -from typing import Type -from typing import Union - from mindflow.db.objects.static_definition.model import ModelID from mindflow.db.objects.static_definition.model_type import ModelType from mindflow.utils.enum import ExtendedEnum diff --git a/mindflow/db/objects/static_definition/model.py b/mindflow/db/objects/static_definition/model.py index e261356..fb63d6c 100644 --- a/mindflow/db/objects/static_definition/model.py +++ b/mindflow/db/objects/static_definition/model.py @@ -1,6 +1,3 @@ -from typing import Type -from typing import Union - from mindflow.db.objects.static_definition.model_type import ModelType from mindflow.utils.enum import ExtendedEnum diff --git a/mindflow/db/objects/static_definition/model_type.py b/mindflow/db/objects/static_definition/model_type.py index b0a4d1c..7e74c73 100644 --- a/mindflow/db/objects/static_definition/model_type.py +++ b/mindflow/db/objects/static_definition/model_type.py @@ -1,5 +1,3 @@ -from typing import Union - from mindflow.utils.enum import ExtendedEnum diff --git a/mindflow/db/objects/static_definition/object.py b/mindflow/db/objects/static_definition/object.py 
index 21cbd25..70d8492 100644 --- a/mindflow/db/objects/static_definition/object.py +++ b/mindflow/db/objects/static_definition/object.py @@ -1,6 +1,3 @@ -from typing import Type -from typing import Union - from mindflow.db.objects.document import Document from mindflow.db.objects.mindflow_model import MindFlowModel from mindflow.db.objects.mindflow_model import MindFlowModelConfig diff --git a/mindflow/db/objects/static_definition/service.py b/mindflow/db/objects/static_definition/service.py index abbb3ba..cb57d31 100644 --- a/mindflow/db/objects/static_definition/service.py +++ b/mindflow/db/objects/static_definition/service.py @@ -1,6 +1,3 @@ -from typing import Type -from typing import Union - from mindflow.db.objects.static_definition.model import ModelID from mindflow.db.objects.static_definition.model import ModelOpenAI from mindflow.db.objects.static_definition.model import ModelTextCompletionOpenAI diff --git a/mindflow/test.py b/mindflow/test.py new file mode 100644 index 0000000..268f479 --- /dev/null +++ b/mindflow/test.py @@ -0,0 +1,7 @@ +import tiktoken + +tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")\ + + +print(tokenizer.encode_batch(["Hello, world!", "Whats up?"])) + diff --git a/mindflow/utils/prompts.py b/mindflow/utils/prompts.py index cc1916e..8b1be87 100644 --- a/mindflow/utils/prompts.py +++ b/mindflow/utils/prompts.py @@ -16,6 +16,7 @@ yet condensed string that can serve as an index for the contents or the purpose \ of a file. I want you to respond in as few words as possible while still conveying \ the full content and purpose of this file." +CHAT_PROMPT_PREFIX = "You are a helpful virtual assistant responding to a users query using your general knowledge and the text provided below." COMMIT_PROMPT_PREFIX = "Please provide a commit message for the following changes. Only respond with the commit message and nothing else." PR_TITLE_PREFIX = "Please provide a title for the following pull request using this git diff summary. Only respond with the title and nothing else." PR_BODY_PREFIX = "Please provide a body for the following pull request using this git diff summary. I want you to keep it high level, and give core \ diff --git a/mindflow/utils/token.py b/mindflow/utils/token.py new file mode 100644 index 0000000..7654570 --- /dev/null +++ b/mindflow/utils/token.py @@ -0,0 +1,15 @@ +from typing import List +from mindflow.db.objects.model import ConfiguredModel + + +def get_token_count(model: ConfiguredModel, text: str) -> int: + """ + This function is used to get the token count of a string. + """ + return len(model.tokenizer.encode(text)) + +def get_batch_token_count(model: ConfiguredModel, texts: List[str]) -> int: + """ + This function is used to get the token count of a list of strings. + """ + return sum([len(encoding) for encoding in model.tokenizer.encode_batch(texts)]) \ No newline at end of file From 970ef35e6126865c96360b9103354ebe0a8e2538 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 10:21:36 -0600 Subject: [PATCH 04/12] Updated token limits and added line breaks to functions and files. 
Co-authored-by: MindFlow --- mindflow/core/chat.py | 7 +++++-- mindflow/core/index.py | 14 +++++++++++--- mindflow/core/query.py | 11 ++++++++--- mindflow/test.py | 4 +--- mindflow/utils/token.py | 3 ++- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/mindflow/core/chat.py b/mindflow/core/chat.py index d44cd0c..f284f7b 100644 --- a/mindflow/core/chat.py +++ b/mindflow/core/chat.py @@ -9,8 +9,11 @@ def run_chat(prompt: str) -> str: """ settings = Settings() completion_model = settings.mindflow_models.query.model - - if get_token_count(completion_model, CHAT_PROMPT_PREFIX+prompt) > completion_model.hard_token_limit: + + if ( + get_token_count(completion_model, CHAT_PROMPT_PREFIX + prompt) + > completion_model.hard_token_limit + ): print("The prompt is too long. Please try again with a shorter prompt.") return "" diff --git a/mindflow/core/index.py b/mindflow/core/index.py index a642153..9fe395d 100644 --- a/mindflow/core/index.py +++ b/mindflow/core/index.py @@ -147,7 +147,10 @@ def binary_split_raw_text_to_nodes( stack = [(0, len(text))] while stack: start, end = stack.pop() - if get_token_count(completion_model, text[start:end]) < completion_model.soft_token_limit: + if ( + get_token_count(completion_model, text[start:end]) + < completion_model.soft_token_limit + ): nodes.append(Node(completion_model, start, end, text[start:end])) else: mid = ((end - start) // 2) + start @@ -167,7 +170,9 @@ def binary_split_nodes_to_chunks( while stack: nodes, start, end = stack.pop() if ( - get_batch_token_count(completion_model, [node.summary for node in nodes[start:end]]) + get_batch_token_count( + completion_model, [node.summary for node in nodes[start:end]] + ) < completion_model.soft_token_limit ): chunks.append(nodes[start:end]) @@ -186,7 +191,10 @@ def create_nodes(completion_model: ConfiguredModel, leaf_nodes: List[Node]) -> N while stack: leaf_nodes, start, end = stack.pop() if ( - get_batch_token_count(completion_model, [leaf_node.summary for leaf_node in leaf_nodes[start:end]]) + get_batch_token_count( + completion_model, + [leaf_node.summary for leaf_node in leaf_nodes[start:end]], + ) > completion_model.soft_token_limit ): node_chunks: List[List[Node]] = binary_split_nodes_to_chunks( diff --git a/mindflow/core/query.py b/mindflow/core/query.py index 8afa11e..9b1ece2 100644 --- a/mindflow/core/query.py +++ b/mindflow/core/query.py @@ -130,7 +130,9 @@ def from_search_tree( return chunks, embeddings -def trim_content(ranked_document_chunks: List[DocumentChunk], model: ConfiguredModel) -> str: +def trim_content( + ranked_document_chunks: List[DocumentChunk], model: ConfiguredModel +) -> str: """ This function is used to select the most relevant content for the prompt. 
""" @@ -146,11 +148,14 @@ def trim_content(ranked_document_chunks: List[DocumentChunk], model: ConfiguredM left, right = 0, len(text) while left <= right: mid = (left + right) // 2 - if get_token_count(model, selected_content + text[:mid]) <= model.hard_token_limit: + if ( + get_token_count(model, selected_content + text[:mid]) + <= model.hard_token_limit + ): left = mid + 1 else: right = mid - 1 - + # Add the selected text to the selected content selected_content += text[:right] diff --git a/mindflow/test.py b/mindflow/test.py index 268f479..83f5199 100644 --- a/mindflow/test.py +++ b/mindflow/test.py @@ -1,7 +1,5 @@ import tiktoken -tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")\ - +tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") print(tokenizer.encode_batch(["Hello, world!", "Whats up?"])) - diff --git a/mindflow/utils/token.py b/mindflow/utils/token.py index 7654570..9c0aeb2 100644 --- a/mindflow/utils/token.py +++ b/mindflow/utils/token.py @@ -8,8 +8,9 @@ def get_token_count(model: ConfiguredModel, text: str) -> int: """ return len(model.tokenizer.encode(text)) + def get_batch_token_count(model: ConfiguredModel, texts: List[str]) -> int: """ This function is used to get the token count of a list of strings. """ - return sum([len(encoding) for encoding in model.tokenizer.encode_batch(texts)]) \ No newline at end of file + return sum([len(encoding) for encoding in model.tokenizer.encode_batch(texts)]) From 4b4381162109f644c7bcef8e8bfa83aa4535b68a Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 10:22:05 -0600 Subject: [PATCH 05/12] Updated version number to 0.3.13. Co-authored-by: MindFlow --- mindflow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindflow/__init__.py b/mindflow/__init__.py index df0ed33..8a3be2e 100644 --- a/mindflow/__init__.py +++ b/mindflow/__init__.py @@ -1 +1 @@ -__version__ = "0.3.12" +__version__ = "0.3.13" From 63456b7dc6489e0235fbdb1f7335e987f05baa83 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 10:54:49 -0600 Subject: [PATCH 06/12] Added typing.Optional and error handling to various functions and classes Co-authored-by: MindFlow --- mindflow/core/chat.py | 6 +++- mindflow/core/git/commit.py | 4 ++- mindflow/core/git/diff.py | 15 +++++++--- mindflow/core/index.py | 7 ++++- mindflow/db/objects/mindflow_model.py | 2 +- mindflow/db/objects/model.py | 42 +++++++++++++++++---------- requirements.txt | 1 + 7 files changed, 53 insertions(+), 24 deletions(-) diff --git a/mindflow/core/chat.py b/mindflow/core/chat.py index f284f7b..757ef1d 100644 --- a/mindflow/core/chat.py +++ b/mindflow/core/chat.py @@ -1,3 +1,4 @@ +from typing import Optional from mindflow.settings import Settings from mindflow.utils.prompts import CHAT_PROMPT_PREFIX from mindflow.utils.token import get_token_count @@ -18,7 +19,7 @@ def run_chat(prompt: str) -> str: return "" # Prompt GPT through Mindflow API or locally - response: str = completion_model( + response: Optional[str] = completion_model( [ { "role": "system", @@ -27,4 +28,7 @@ def run_chat(prompt: str) -> str: {"role": "user", "content": prompt}, ] ) + + if response is None: + return "Unable to generate response. Please try again. If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues." 
return response diff --git a/mindflow/core/git/commit.py b/mindflow/core/git/commit.py index d3c0be9..38da009 100644 --- a/mindflow/core/git/commit.py +++ b/mindflow/core/git/commit.py @@ -20,9 +20,11 @@ def run_commit(args: Tuple[str], message_overwrite: Optional[str] = None) -> str if diff_output == "No staged changes.": return diff_output - response: str = settings.mindflow_models.query.model( + response: Optional[str] = settings.mindflow_models.query.model( build_context_prompt(COMMIT_PROMPT_PREFIX, diff_output) ) + if response is None: + return "Unable to generate a commit message. Please try again - this may be a temporary issue with the OpenAI API. If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues" # add co-authorship to commit message response += "\n\nCo-authored-by: MindFlow " diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py index eddaa41..9771942 100644 --- a/mindflow/core/git/diff.py +++ b/mindflow/core/git/diff.py @@ -3,9 +3,8 @@ """ import concurrent.futures import subprocess -from typing import Dict +from typing import Dict, Optional from typing import List -from typing import Optional from typing import Tuple from mindflow.db.objects.model import ConfiguredModel @@ -13,7 +12,7 @@ from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import GIT_DIFF_PROMPT_PREFIX -from mindflow.utils.diff_parser import parse_git_diff, IGNORE_FILE_EXTENSIONS +from mindflow.utils.diff_parser import parse_git_diff def run_diff(args: Tuple[str]) -> str: @@ -47,6 +46,8 @@ def run_diff(args: Tuple[str]) -> str: response = completion_model( build_context_prompt(GIT_DIFF_PROMPT_PREFIX, content) ) + if response is None: + print("Warning: model failed to return response for diff. Please raise issue on github if this persists: https://github.com/nollied/mindflow-cli/issues") else: with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] @@ -62,7 +63,13 @@ def run_diff(args: Tuple[str]) -> str: # Process the results as they become available for future in concurrent.futures.as_completed(futures): - response += future.result() + partial_response: Optional[str] = future.result() + if partial_response is None: + print("Warning: model failed to return response for part of, or entire, diff. 
Please raise issue on github if this persists: https://github.com/nollied/mindflow-cli/issues") + partial_response = "" + + response += partial_response + if len(excluded_filenames) > 0: response += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}" diff --git a/mindflow/core/index.py b/mindflow/core/index.py index 9fe395d..678a7c4 100644 --- a/mindflow/core/index.py +++ b/mindflow/core/index.py @@ -3,6 +3,7 @@ """ from concurrent.futures import ThreadPoolExecutor from copy import deepcopy +import logging from typing import List from typing import Optional @@ -97,9 +98,13 @@ def __init__( self.start = start self.end = end if text: - self.summary = completion_model( + self.summary: Optional[str] = completion_model( build_context_prompt(INDEX_PROMPT_PREFIX, text) ) + if self.summary is None: + self.summary = "" + logging.warning("Unable to generate summary for node.") + def set_leaves(self, leaves: List["Node"]) -> None: self.leaves = leaves diff --git a/mindflow/db/objects/mindflow_model.py b/mindflow/db/objects/mindflow_model.py index 77b2e5b..3c68449 100644 --- a/mindflow/db/objects/mindflow_model.py +++ b/mindflow/db/objects/mindflow_model.py @@ -57,7 +57,7 @@ def __init__(self, mindflow_model_id: str, configured_services: ConfiguredServic if model_id is None: model_id = self.get_default_model_id(mindflow_model_id, configured_services) - self.model = ConfiguredModel(model_id) + self.model = ConfiguredModel(model_id, mindflow_model_id) def get_default_model_id( self, mindflow_model_id: str, configured_services: ConfiguredServices diff --git a/mindflow/db/objects/model.py b/mindflow/db/objects/model.py index 803e623..4661a80 100644 --- a/mindflow/db/objects/model.py +++ b/mindflow/db/objects/model.py @@ -9,6 +9,7 @@ from mindflow.db.objects.base import BaseObject from mindflow.db.objects.base import StaticObject from mindflow.db.objects.service import ServiceConfig +from mindflow.db.objects.static_definition.mind_flow_model import MindFlowModelID from mindflow.db.objects.static_definition.model import ModelID from mindflow.db.objects.static_definition.service import ServiceID @@ -54,7 +55,10 @@ class ConfiguredModel(Callable): soft_token_limit: int api_key: str - def __init__(self, model_id: str): + # MindFlow Model That instantiated this model + _mindflow_model_id: str = None + + def __init__(self, model_id: str, mindflow_model_id: str): model = Model.load(model_id) model_config = ModelConfig.load(f"{model_id}_config") @@ -79,21 +83,27 @@ def openai_chat_completion( max_tokens: int = 500, temperature: float = 0.0, stop: Optional[list] = None, - ): - openai.api_key = self.api_key - return openai.ChatCompletion.create( - model=self.id, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - stop=stop, - )["choices"][0]["message"]["content"] - - def openai_embedding(self, text: str): - openai.api_key = self.api_key - return openai.Embedding.create(engine=self.id, input=text)["data"][0][ - "embedding" - ] + ) -> Optional[str]: + try: + openai.api_key = self.api_key + return openai.ChatCompletion.create( + model=self.id, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + stop=stop, + )["choices"][0]["message"]["content"] + except Exception as e: + return None + + def openai_embedding(self, text: str) -> Optional[list]: + try: + openai.api_key = self.api_key + return openai.Embedding.create(engine=self.id, input=text)["data"][0][ + "embedding" + ] + except Exception as e: + return None def __call__(self, 
prompt, *args, **kwargs): if self.service == ServiceID.OPENAI.value: diff --git a/requirements.txt b/requirements.txt index f7c5ba5..a9c97e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ alive-progress click numpy openai==0.27.0 +tiktoken==0.3.0 pytest scikit-learn tqdm From a13911dc3aa8d819e7d7de7a95b56ae402091edf Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 11:08:46 -0600 Subject: [PATCH 07/12] Refactored `run_mr` and `run_pr` to use tuple returned by `create_title_and_body`, added empty tuple check. Changed `create_title_and_body` to return optional tuple, added check for `None` values in `title` and `body`. Added new file `errors.py`. Co-authored-by: MindFlow --- mindflow/core/git/mr.py | 7 ++++++- mindflow/core/git/pr.py | 15 +++++++++++---- mindflow/utils/errors.py | 0 3 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 mindflow/utils/errors.py diff --git a/mindflow/core/git/mr.py b/mindflow/core/git/mr.py index 96bf8f7..fc1cbf3 100644 --- a/mindflow/core/git/mr.py +++ b/mindflow/core/git/mr.py @@ -32,7 +32,12 @@ def run_mr( return if not title or not description: - title, description = create_title_and_body(base_branch, title, description) + tital_description_tuple = create_title_and_body(base_branch, title, description) + + if not tital_description_tuple: + return + + title, description = tital_description_tuple create_merge_request(args, title, description) diff --git a/mindflow/core/git/pr.py b/mindflow/core/git/pr.py index 5b6df5f..2afd5c6 100644 --- a/mindflow/core/git/pr.py +++ b/mindflow/core/git/pr.py @@ -35,8 +35,12 @@ def run_pr(args: Tuple[str], title: Optional[str] = None, body: Optional[str] = return if not title or not body: - title, body = create_title_and_body(base_branch, title, body) - + tital_body_tuple = create_title_and_body(base_branch, title, body) + + if not tital_body_tuple: + return + + title, body = tital_body_tuple create_pull_request(args, title, body) @@ -58,7 +62,7 @@ def is_valid_pr(head_branch: str, base_branch: str) -> bool: def create_title_and_body( base_branch, title: Optional[str], body: Optional[str] -) -> Tuple[str, str]: +) -> Optional[Tuple[str, str]]: settings = Settings() diff_output = run_diff((base_branch,)) @@ -84,7 +88,10 @@ def create_title_and_body( if body is None: pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output) body = settings.mindflow_models.query.model(pr_body_prompt) - + + if title is None or body is None: + print("Unable to generate a pull request title and body. Please try again - this may be a temporary issue with the OpenAI API. If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues") + return None return title, body diff --git a/mindflow/utils/errors.py b/mindflow/utils/errors.py new file mode 100644 index 0000000..e69de29 From 2abc98cb24616153fa5311990fcd4e07db575339 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 14:02:37 -0600 Subject: [PATCH 08/12] Added error handling and import statements for various modules and classes. 
Co-authored-by: MindFlow --- mindflow/core/chat.py | 3 +- mindflow/core/git/commit.py | 9 +++--- mindflow/core/git/diff.py | 61 +++++++++++++++++++++++------------- mindflow/core/git/pr.py | 23 +++++++++----- mindflow/core/index.py | 11 ++++--- mindflow/core/query.py | 32 +++++++++++++------ mindflow/db/objects/model.py | 19 +++++------ mindflow/utils/constants.py | 6 ++++ mindflow/utils/errors.py | 46 +++++++++++++++++++++++++++ 9 files changed, 152 insertions(+), 58 deletions(-) create mode 100644 mindflow/utils/constants.py diff --git a/mindflow/core/chat.py b/mindflow/core/chat.py index 757ef1d..dbdcde7 100644 --- a/mindflow/core/chat.py +++ b/mindflow/core/chat.py @@ -1,5 +1,6 @@ from typing import Optional from mindflow.settings import Settings +from mindflow.utils.constants import MinimumReservedLength from mindflow.utils.prompts import CHAT_PROMPT_PREFIX from mindflow.utils.token import get_token_count @@ -13,7 +14,7 @@ def run_chat(prompt: str) -> str: if ( get_token_count(completion_model, CHAT_PROMPT_PREFIX + prompt) - > completion_model.hard_token_limit + > completion_model.hard_token_limit - MinimumReservedLength.CHAT.value ): print("The prompt is too long. Please try again with a shorter prompt.") return "" diff --git a/mindflow/core/git/commit.py b/mindflow/core/git/commit.py index 38da009..ff64368 100644 --- a/mindflow/core/git/commit.py +++ b/mindflow/core/git/commit.py @@ -1,8 +1,9 @@ import subprocess -from typing import Tuple, Optional +from typing import Tuple, Optional, Union from mindflow.core.git.diff import run_diff from mindflow.settings import Settings +from mindflow.utils.errors import ModelError from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import COMMIT_PROMPT_PREFIX @@ -20,11 +21,11 @@ def run_commit(args: Tuple[str], message_overwrite: Optional[str] = None) -> str if diff_output == "No staged changes.": return diff_output - response: Optional[str] = settings.mindflow_models.query.model( + response: Union[ModelError, str] = settings.mindflow_models.query.model( build_context_prompt(COMMIT_PROMPT_PREFIX, diff_output) ) - if response is None: - return "Unable to generate a commit message. Please try again - this may be a temporary issue with the OpenAI API. If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues" + if isinstance(response, ModelError): + return response.commit_message # add co-authorship to commit message response += "\n\nCo-authored-by: MindFlow " diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py index 9771942..d401fc4 100644 --- a/mindflow/core/git/diff.py +++ b/mindflow/core/git/diff.py @@ -3,16 +3,19 @@ """ import concurrent.futures import subprocess -from typing import Dict, Optional +from typing import Dict, Union from typing import List from typing import Tuple from mindflow.db.objects.model import ConfiguredModel from mindflow.settings import Settings +from mindflow.utils.constants import MinimumReservedLength +from mindflow.utils.errors import ModelError from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import GIT_DIFF_PROMPT_PREFIX from mindflow.utils.diff_parser import parse_git_diff +from mindflow.utils.token import get_token_count def run_diff(args: Tuple[str]) -> str: @@ -35,7 +38,7 @@ def run_diff(args: Tuple[str]) -> str: return "No staged changes." 
batched_parsed_diff_result = batch_git_diffs( - diff_dict, token_limit=completion_model.hard_token_limit + diff_dict, completion_model ) response: str = "" @@ -43,11 +46,11 @@ def run_diff(args: Tuple[str]) -> str: content = "" for file_name, diff_content in batched_parsed_diff_result[0]: content += f"*{file_name}*\n DIFF CONTENT: {diff_content}\n\n" - response = completion_model( + response: Union[ModelError, str] = completion_model( build_context_prompt(GIT_DIFF_PROMPT_PREFIX, content) ) - if response is None: - print("Warning: model failed to return response for diff. Please raise issue on github if this persists: https://github.com/nollied/mindflow-cli/issues") + if isinstance(response, ModelError): + print(response.diff_message) else: with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] @@ -63,10 +66,10 @@ def run_diff(args: Tuple[str]) -> str: # Process the results as they become available for future in concurrent.futures.as_completed(futures): - partial_response: Optional[str] = future.result() - if partial_response is None: - print("Warning: model failed to return response for part of, or entire, diff. Please raise issue on github if this persists: https://github.com/nollied/mindflow-cli/issues") - partial_response = "" + partial_response: Union[ModelError, str] = future.result() + if isinstance(partial_response, ModelError): + print(partial_response.diff_partial_message) + continue response += partial_response @@ -81,31 +84,45 @@ def run_diff(args: Tuple[str]) -> str: def batch_git_diffs( - file_diffs: Dict[str, str], token_limit: int + file_diffs: Dict[str, str], model: ConfiguredModel ) -> List[List[Tuple[str, str]]]: batches = [] current_batch: List = [] - current_batch_size = 0 + current_batch_text = "" for file_name, diff_content in file_diffs.items(): - if len(diff_content) > token_limit: - chunks = [ - diff_content[i : i + token_limit] - for i in range(0, len(diff_content), token_limit) - ] + if get_token_count(model, diff_content) > model.hard_token_limit - MinimumReservedLength.DIFF.value: + ## Split the diff into chunks that are less than the token limit + chunks = [diff_content] + while True: + new_chunks = [] + for chunk in chunks: + if get_token_count(model, chunk) > model.hard_token_limit - MinimumReservedLength.DIFF.value: + half_len = len(chunk) // 2 + left_half = chunk[:half_len] + right_half = chunk[half_len:] + new_chunks.extend([left_half, right_half]) + else: + new_chunks.append(chunk) + if new_chunks == chunks: + break + chunks = new_chunks + + ## Add the chunks to the batch or multiple batches for chunk in chunks: - if current_batch_size + len(chunk) > token_limit * 2: + if get_token_count(model, current_batch_text+chunk) > model.hard_token_limit - MinimumReservedLength.DIFF.value: batches.append(current_batch) current_batch = [] - current_batch_size = 0 + current_batch_text = "" current_batch.append((file_name, chunk)) - current_batch_size += len(chunk) - elif current_batch_size + len(diff_content) > token_limit * 2: + current_batch_text += chunk + + elif get_token_count(model, current_batch_text+diff_content) > model.hard_token_limit - MinimumReservedLength.DIFF.value: batches.append(current_batch) current_batch = [(file_name, diff_content)] - current_batch_size = len(diff_content) + current_batch_text = diff_content else: current_batch.append((file_name, diff_content)) - current_batch_size += len(diff_content) + current_batch_text += diff_content if current_batch: batches.append(current_batch) return batches diff --git 
a/mindflow/core/git/pr.py b/mindflow/core/git/pr.py index 2afd5c6..54b6707 100644 --- a/mindflow/core/git/pr.py +++ b/mindflow/core/git/pr.py @@ -1,10 +1,11 @@ import concurrent.futures import subprocess -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union from mindflow.core.git.diff import run_diff from mindflow.settings import Settings from mindflow.utils.command_parse import get_flag_value +from mindflow.utils.errors import ModelError from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import PR_BODY_PREFIX from mindflow.utils.prompts import PR_TITLE_PREFIX @@ -79,19 +80,25 @@ def create_title_and_body( settings.mindflow_models.query.model, pr_body_prompt ) - title = future_title.result() - body = future_body.result() + title_response: Union[ModelError, str] = future_title.result() + body_response: Union[ModelError, str] = future_body.result() else: - if title is None: + if title_response is None: pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output) - title = settings.mindflow_models.query.model(pr_title_prompt) + title_response: Union[ModelError, str] = settings.mindflow_models.query.model(pr_title_prompt) if body is None: pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output) - body = settings.mindflow_models.query.model(pr_body_prompt) + body_response: Union[ModelError, str] = settings.mindflow_models.query.model(pr_body_prompt) - if title is None or body is None: - print("Unable to generate a pull request title and body. Please try again - this may be a temporary issue with the OpenAI API. If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues") + if isinstance(title_response, ModelError): + print(title_response.pr_message) return None + if isinstance(body_response, ModelError): + print(body_response.pr_message) + return None + + title = title_response if title is None else title + body = body_response if body is None else body return title, body diff --git a/mindflow/core/index.py b/mindflow/core/index.py index 678a7c4..5f645ea 100644 --- a/mindflow/core/index.py +++ b/mindflow/core/index.py @@ -4,7 +4,7 @@ from concurrent.futures import ThreadPoolExecutor from copy import deepcopy import logging -from typing import List +from typing import List, Union from typing import Optional import numpy as np @@ -18,6 +18,7 @@ from mindflow.resolving.resolve import resolve_all from mindflow.resolving.resolve import return_if_indexable from mindflow.settings import Settings +from mindflow.utils.errors import ModelError from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import INDEX_PROMPT_PREFIX from mindflow.utils.token import get_batch_token_count, get_token_count @@ -98,12 +99,14 @@ def __init__( self.start = start self.end = end if text: - self.summary: Optional[str] = completion_model( + response: Union[str, ModelError] = completion_model( build_context_prompt(INDEX_PROMPT_PREFIX, text) ) - if self.summary is None: + if isinstance(response, ModelError): self.summary = "" - logging.warning("Unable to generate summary for node.") + print(response.index_message) + else: + self.summary = response def set_leaves(self, leaves: List["Node"]) -> None: diff --git a/mindflow/core/query.py b/mindflow/core/query.py index 9b1ece2..2b8e6ef 100644 --- a/mindflow/core/query.py +++ b/mindflow/core/query.py @@ -4,7 +4,7 @@ import sys from concurrent.futures import as_completed from concurrent.futures import 
ThreadPoolExecutor -from typing import Dict +from typing import Dict, Union from typing import List from typing import Optional from typing import Tuple @@ -17,6 +17,8 @@ from mindflow.db.objects.model import ConfiguredModel from mindflow.resolving.resolve import resolve_all from mindflow.settings import Settings +from mindflow.utils.constants import MinimumReservedLength +from mindflow.utils.errors import ModelError from mindflow.utils.token import get_token_count @@ -38,7 +40,10 @@ def run_query(document_paths: List[str], query: str): embedding_model, ), ) - response = completion_model(messages) + response: Union[ModelError, str] = completion_model(messages) + if isinstance(response, ModelError): + return response.query_message + return response @@ -74,7 +79,7 @@ def select_content( ) sys.exit(1) - selected_content = trim_content(embedding_ranked_document_chunks, completion_model) + selected_content = trim_content(embedding_ranked_document_chunks, completion_model, query) return selected_content @@ -110,8 +115,13 @@ def from_search_tree( document.search_tree["end"], ) ] + embedding_response: Union[ModelError, np.ndarray] = embedding_model(document.search_tree["summary"]) + if isinstance(embedding_response, ModelError): + print(embedding_response.embedding_message) + return [], [] + embeddings: List[np.ndarray] = [ - embedding_model(document.search_tree["summary"]) + embedding_response ] rolling_summary: List[str] = [] while stack: @@ -121,17 +131,20 @@ def from_search_tree( for leaf in node["leaves"]: stack.append(leaf) chunks.append(cls(document.path, leaf["start"], leaf["end"])) - rolling_summary_embedding = embedding_model( + rolling_summary_embedding_response: Union[np.ndarray, ModelError] = embedding_model( "\n\n".join(rolling_summary) + "\n\n" + leaf["summary"], ) - embeddings.append(rolling_summary_embedding) + if isinstance(rolling_summary_embedding_response, ModelError): + print(rolling_summary_embedding_response.embedding_message) + continue + embeddings.append(rolling_summary_embedding_response) rolling_summary.pop() return chunks, embeddings def trim_content( - ranked_document_chunks: List[DocumentChunk], model: ConfiguredModel + ranked_document_chunks: List[DocumentChunk], model: ConfiguredModel, query: str ) -> str: """ This function is used to select the most relevant content for the prompt. 
@@ -149,8 +162,8 @@ def trim_content( while left <= right: mid = (left + right) // 2 if ( - get_token_count(model, selected_content + text[:mid]) - <= model.hard_token_limit + get_token_count(model, query + selected_content + text[:mid]) + <= model.hard_token_limit - MinimumReservedLength.QUERY.value ): left = mid + 1 else: @@ -158,7 +171,6 @@ def trim_content( # Add the selected text to the selected content selected_content += text[:right] - return selected_content diff --git a/mindflow/db/objects/model.py b/mindflow/db/objects/model.py index 4661a80..c127110 100644 --- a/mindflow/db/objects/model.py +++ b/mindflow/db/objects/model.py @@ -1,6 +1,7 @@ -from typing import Optional +from typing import Optional, Union import openai +import numpy as np from traitlets import Callable import tiktoken @@ -9,9 +10,9 @@ from mindflow.db.objects.base import BaseObject from mindflow.db.objects.base import StaticObject from mindflow.db.objects.service import ServiceConfig -from mindflow.db.objects.static_definition.mind_flow_model import MindFlowModelID from mindflow.db.objects.static_definition.model import ModelID from mindflow.db.objects.static_definition.service import ServiceID +from mindflow.utils.errors import ModelError, EmbeddingModelError class Model(StaticObject): @@ -80,10 +81,10 @@ def __init__(self, model_id: str, mindflow_model_id: str): def openai_chat_completion( self, messages: list, - max_tokens: int = 500, temperature: float = 0.0, + max_tokens: Optional[int] = None, stop: Optional[list] = None, - ) -> Optional[str]: + ) -> Union[str, ModelError]: try: openai.api_key = self.api_key return openai.ChatCompletion.create( @@ -93,17 +94,17 @@ def openai_chat_completion( max_tokens=max_tokens, stop=stop, )["choices"][0]["message"]["content"] - except Exception as e: - return None + except ModelError as e: + return e - def openai_embedding(self, text: str) -> Optional[list]: + def openai_embedding(self, text: str) -> Union[np.ndarray, ModelError]: try: openai.api_key = self.api_key return openai.Embedding.create(engine=self.id, input=text)["data"][0][ "embedding" ] - except Exception as e: - return None + except ModelError as e: + return e def __call__(self, prompt, *args, **kwargs): if self.service == ServiceID.OPENAI.value: diff --git a/mindflow/utils/constants.py b/mindflow/utils/constants.py new file mode 100644 index 0000000..929efc3 --- /dev/null +++ b/mindflow/utils/constants.py @@ -0,0 +1,6 @@ +from enum import Enum + +class MinimumReservedLength(Enum): + CHAT = 500 + QUERY = 500 + DIFF = 600 diff --git a/mindflow/utils/errors.py b/mindflow/utils/errors.py index e69de29..ed6eba8 100644 --- a/mindflow/utils/errors.py +++ b/mindflow/utils/errors.py @@ -0,0 +1,46 @@ +GITHUB_ISSUE_MESSAGE = "If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues" +CONNECTION_MESSAGE = "Please check your internet connection and try again." + +class ModelError(Exception): + """Base class for all exceptions raised by this module.""" + def __init__(self, message): + self.message = message + super().__init__(self.message) + + @property + def base_message(self): + return f"Model API failed to return response for chat/query. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}." + + @property + def commit_message(self): + return f"Model API failed to return response for commit. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + + @property + def diff_message(self): + return f"Model API failed to return response for diff. {CONNECTION_MESSAGE}. 
{GITHUB_ISSUE_MESSAGE}" + + @property + def diff_partial_message(self): + return f"Warning: model API failed to return response for part of, or entire, diff. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + + @property + def pr_message(self): + return f"Model API failed to return response for pr/mr. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + + @property + def index_message(self): + return f"Warning: Model API failed to return response for a document chunk. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + + @property + def query_message(self): + return f"Model API failed to return response for query. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + + @property + def embedding_message(self): + return f"Warning: Model API failed to return response for embedding. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + +class EmbeddingModelError(Exception): + """Base class for all exceptions raised by this module.""" + def __init__(self, message): + self.message = message + super().__init__(self.message) From 2c81725df4b4ff60880155e5e7c544e36330d869 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 14:05:27 -0600 Subject: [PATCH 09/12] Cleaned up code formatting and fixed typos in Git-related modules and added newlines at end of files in utils. Co-authored-by: MindFlow --- mindflow/core/git/diff.py | 27 ++++++++++++++++++--------- mindflow/core/git/mr.py | 4 ++-- mindflow/core/git/pr.py | 16 ++++++++++------ mindflow/core/index.py | 1 - mindflow/core/query.py | 20 ++++++++++++-------- mindflow/utils/constants.py | 1 + mindflow/utils/errors.py | 16 ++++++++++------ 7 files changed, 53 insertions(+), 32 deletions(-) diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py index d401fc4..2e59faf 100644 --- a/mindflow/core/git/diff.py +++ b/mindflow/core/git/diff.py @@ -37,9 +37,7 @@ def run_diff(args: Tuple[str]) -> str: if len(diff_dict) <= 0: return "No staged changes." 
- batched_parsed_diff_result = batch_git_diffs( - diff_dict, completion_model - ) + batched_parsed_diff_result = batch_git_diffs(diff_dict, completion_model) response: str = "" if len(batched_parsed_diff_result) == 1: @@ -70,9 +68,8 @@ def run_diff(args: Tuple[str]) -> str: if isinstance(partial_response, ModelError): print(partial_response.diff_partial_message) continue - - response += partial_response + response += partial_response if len(excluded_filenames) > 0: response += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}" @@ -90,13 +87,19 @@ def batch_git_diffs( current_batch: List = [] current_batch_text = "" for file_name, diff_content in file_diffs.items(): - if get_token_count(model, diff_content) > model.hard_token_limit - MinimumReservedLength.DIFF.value: + if ( + get_token_count(model, diff_content) + > model.hard_token_limit - MinimumReservedLength.DIFF.value + ): ## Split the diff into chunks that are less than the token limit chunks = [diff_content] while True: new_chunks = [] for chunk in chunks: - if get_token_count(model, chunk) > model.hard_token_limit - MinimumReservedLength.DIFF.value: + if ( + get_token_count(model, chunk) + > model.hard_token_limit - MinimumReservedLength.DIFF.value + ): half_len = len(chunk) // 2 left_half = chunk[:half_len] right_half = chunk[half_len:] @@ -109,14 +112,20 @@ def batch_git_diffs( ## Add the chunks to the batch or multiple batches for chunk in chunks: - if get_token_count(model, current_batch_text+chunk) > model.hard_token_limit - MinimumReservedLength.DIFF.value: + if ( + get_token_count(model, current_batch_text + chunk) + > model.hard_token_limit - MinimumReservedLength.DIFF.value + ): batches.append(current_batch) current_batch = [] current_batch_text = "" current_batch.append((file_name, chunk)) current_batch_text += chunk - elif get_token_count(model, current_batch_text+diff_content) > model.hard_token_limit - MinimumReservedLength.DIFF.value: + elif ( + get_token_count(model, current_batch_text + diff_content) + > model.hard_token_limit - MinimumReservedLength.DIFF.value + ): batches.append(current_batch) current_batch = [(file_name, diff_content)] current_batch_text = diff_content diff --git a/mindflow/core/git/mr.py b/mindflow/core/git/mr.py index fc1cbf3..bead0cd 100644 --- a/mindflow/core/git/mr.py +++ b/mindflow/core/git/mr.py @@ -33,10 +33,10 @@ def run_mr( if not title or not description: tital_description_tuple = create_title_and_body(base_branch, title, description) - + if not tital_description_tuple: return - + title, description = tital_description_tuple create_merge_request(args, title, description) diff --git a/mindflow/core/git/pr.py b/mindflow/core/git/pr.py index 54b6707..5866b04 100644 --- a/mindflow/core/git/pr.py +++ b/mindflow/core/git/pr.py @@ -37,10 +37,10 @@ def run_pr(args: Tuple[str], title: Optional[str] = None, body: Optional[str] = if not title or not body: tital_body_tuple = create_title_and_body(base_branch, title, body) - + if not tital_body_tuple: return - + title, body = tital_body_tuple create_pull_request(args, title, body) @@ -85,18 +85,22 @@ def create_title_and_body( else: if title_response is None: pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output) - title_response: Union[ModelError, str] = settings.mindflow_models.query.model(pr_title_prompt) + title_response: Union[ + ModelError, str + ] = settings.mindflow_models.query.model(pr_title_prompt) if body is None: pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output) - 
body_response: Union[ModelError, str] = settings.mindflow_models.query.model(pr_body_prompt) - + body_response: Union[ + ModelError, str + ] = settings.mindflow_models.query.model(pr_body_prompt) + if isinstance(title_response, ModelError): print(title_response.pr_message) return None if isinstance(body_response, ModelError): print(body_response.pr_message) return None - + title = title_response if title is None else title body = body_response if body is None else body return title, body diff --git a/mindflow/core/index.py b/mindflow/core/index.py index 5f645ea..2eacb44 100644 --- a/mindflow/core/index.py +++ b/mindflow/core/index.py @@ -108,7 +108,6 @@ def __init__( else: self.summary = response - def set_leaves(self, leaves: List["Node"]) -> None: self.leaves = leaves diff --git a/mindflow/core/query.py b/mindflow/core/query.py index 2b8e6ef..56ec2b6 100644 --- a/mindflow/core/query.py +++ b/mindflow/core/query.py @@ -43,7 +43,7 @@ def run_query(document_paths: List[str], query: str): response: Union[ModelError, str] = completion_model(messages) if isinstance(response, ModelError): return response.query_message - + return response @@ -79,7 +79,9 @@ def select_content( ) sys.exit(1) - selected_content = trim_content(embedding_ranked_document_chunks, completion_model, query) + selected_content = trim_content( + embedding_ranked_document_chunks, completion_model, query + ) return selected_content @@ -115,14 +117,14 @@ def from_search_tree( document.search_tree["end"], ) ] - embedding_response: Union[ModelError, np.ndarray] = embedding_model(document.search_tree["summary"]) + embedding_response: Union[ModelError, np.ndarray] = embedding_model( + document.search_tree["summary"] + ) if isinstance(embedding_response, ModelError): print(embedding_response.embedding_message) return [], [] - - embeddings: List[np.ndarray] = [ - embedding_response - ] + + embeddings: List[np.ndarray] = [embedding_response] rolling_summary: List[str] = [] while stack: node = stack.pop() @@ -131,7 +133,9 @@ def from_search_tree( for leaf in node["leaves"]: stack.append(leaf) chunks.append(cls(document.path, leaf["start"], leaf["end"])) - rolling_summary_embedding_response: Union[np.ndarray, ModelError] = embedding_model( + rolling_summary_embedding_response: Union[ + np.ndarray, ModelError + ] = embedding_model( "\n\n".join(rolling_summary) + "\n\n" + leaf["summary"], ) if isinstance(rolling_summary_embedding_response, ModelError): diff --git a/mindflow/utils/constants.py b/mindflow/utils/constants.py index 929efc3..21791b8 100644 --- a/mindflow/utils/constants.py +++ b/mindflow/utils/constants.py @@ -1,5 +1,6 @@ from enum import Enum + class MinimumReservedLength(Enum): CHAT = 500 QUERY = 500 diff --git a/mindflow/utils/errors.py b/mindflow/utils/errors.py index ed6eba8..5918d2f 100644 --- a/mindflow/utils/errors.py +++ b/mindflow/utils/errors.py @@ -1,12 +1,14 @@ GITHUB_ISSUE_MESSAGE = "If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues" CONNECTION_MESSAGE = "Please check your internet connection and try again." + class ModelError(Exception): """Base class for all exceptions raised by this module.""" + def __init__(self, message): self.message = message super().__init__(self.message) - + @property def base_message(self): return f"Model API failed to return response for chat/query. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}." @@ -19,28 +21,30 @@ def commit_message(self): def diff_message(self): return f"Model API failed to return response for diff. 
{CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" - @property + @property def diff_partial_message(self): return f"Warning: model API failed to return response for part of, or entire, diff. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" - + @property def pr_message(self): return f"Model API failed to return response for pr/mr. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" - + @property def index_message(self): return f"Warning: Model API failed to return response for a document chunk. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" - + @property def query_message(self): return f"Model API failed to return response for query. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" - + @property def embedding_message(self): return f"Warning: Model API failed to return response for embedding. {CONNECTION_MESSAGE}. {GITHUB_ISSUE_MESSAGE}" + class EmbeddingModelError(Exception): """Base class for all exceptions raised by this module.""" + def __init__(self, message): self.message = message super().__init__(self.message) From 63427bbbfbec168f8a2797873b31ea4660a8e60c Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 14:12:53 -0600 Subject: [PATCH 10/12] Refactored variable names and removed unnecessary parameters and attributes. Co-authored-by: MindFlow --- mindflow/core/git/diff.py | 22 +++++++++++----------- mindflow/core/git/pr.py | 2 +- mindflow/db/objects/mindflow_model.py | 2 +- mindflow/db/objects/model.py | 5 +---- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py index 2e59faf..4ddca4e 100644 --- a/mindflow/core/git/diff.py +++ b/mindflow/core/git/diff.py @@ -39,16 +39,17 @@ def run_diff(args: Tuple[str]) -> str: batched_parsed_diff_result = batch_git_diffs(diff_dict, completion_model) - response: str = "" + diff_summary: str = "" if len(batched_parsed_diff_result) == 1: content = "" for file_name, diff_content in batched_parsed_diff_result[0]: content += f"*{file_name}*\n DIFF CONTENT: {diff_content}\n\n" - response: Union[ModelError, str] = completion_model( + diff_response: Union[ModelError, str] = completion_model( build_context_prompt(GIT_DIFF_PROMPT_PREFIX, content) ) - if isinstance(response, ModelError): - print(response.diff_message) + if isinstance(diff_response, ModelError): + return diff_response.diff_message + diff_summary += diff_response else: with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] @@ -64,17 +65,16 @@ def run_diff(args: Tuple[str]) -> str: # Process the results as they become available for future in concurrent.futures.as_completed(futures): - partial_response: Union[ModelError, str] = future.result() - if isinstance(partial_response, ModelError): - print(partial_response.diff_partial_message) - continue + diff_partial_response: Union[ModelError, str] = future.result() + if isinstance(diff_partial_response, ModelError): + return diff_partial_response.diff_partial_message - response += partial_response + diff_summary += diff_partial_response if len(excluded_filenames) > 0: - response += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}" + diff_summary += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}" - return response + return diff_summary import re diff --git a/mindflow/core/git/pr.py b/mindflow/core/git/pr.py index 5866b04..ab54186 100644 --- a/mindflow/core/git/pr.py +++ b/mindflow/core/git/pr.py @@ -83,7 +83,7 @@ def create_title_and_body( title_response: Union[ModelError, str] = 
future_title.result() body_response: Union[ModelError, str] = future_body.result() else: - if title_response is None: + if title is None: pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output) title_response: Union[ ModelError, str diff --git a/mindflow/db/objects/mindflow_model.py b/mindflow/db/objects/mindflow_model.py index 3c68449..77b2e5b 100644 --- a/mindflow/db/objects/mindflow_model.py +++ b/mindflow/db/objects/mindflow_model.py @@ -57,7 +57,7 @@ def __init__(self, mindflow_model_id: str, configured_services: ConfiguredServic if model_id is None: model_id = self.get_default_model_id(mindflow_model_id, configured_services) - self.model = ConfiguredModel(model_id, mindflow_model_id) + self.model = ConfiguredModel(model_id) def get_default_model_id( self, mindflow_model_id: str, configured_services: ConfiguredServices diff --git a/mindflow/db/objects/model.py b/mindflow/db/objects/model.py index c127110..f946698 100644 --- a/mindflow/db/objects/model.py +++ b/mindflow/db/objects/model.py @@ -56,10 +56,7 @@ class ConfiguredModel(Callable): soft_token_limit: int api_key: str - # MindFlow Model That instantiated this model - _mindflow_model_id: str = None - - def __init__(self, model_id: str, mindflow_model_id: str): + def __init__(self, model_id: str): model = Model.load(model_id) model_config = ModelConfig.load(f"{model_id}_config") From 192721b32567c986281afba0a9f7405397350f29 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 14:15:54 -0600 Subject: [PATCH 11/12] Refactor pr.py to simplify code and add type hints. Co-authored-by: MindFlow --- mindflow/core/git/pr.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mindflow/core/git/pr.py b/mindflow/core/git/pr.py index ab54186..0d7ff41 100644 --- a/mindflow/core/git/pr.py +++ b/mindflow/core/git/pr.py @@ -68,6 +68,8 @@ def create_title_and_body( diff_output = run_diff((base_branch,)) + title_response: Union[ModelError, str] + body_response: Union[ModelError, str] if title is None and body is None: pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output) pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output) @@ -80,19 +82,15 @@ def create_title_and_body( settings.mindflow_models.query.model, pr_body_prompt ) - title_response: Union[ModelError, str] = future_title.result() - body_response: Union[ModelError, str] = future_body.result() + title_response = future_title.result() + body_response = future_body.result() else: if title is None: pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output) - title_response: Union[ - ModelError, str - ] = settings.mindflow_models.query.model(pr_title_prompt) + title_response = settings.mindflow_models.query.model(pr_title_prompt) if body is None: pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output) - body_response: Union[ - ModelError, str - ] = settings.mindflow_models.query.model(pr_body_prompt) + body_response = settings.mindflow_models.query.model(pr_body_prompt) if isinstance(title_response, ModelError): print(title_response.pr_message) @@ -101,8 +99,8 @@ def create_title_and_body( print(body_response.pr_message) return None - title = title_response if title is None else title - body = body_response if body is None else body + title = title if title is not None else title_response + body = body if body is not None else body_response return title, body From aa1c3c3392ed7310a7d3a7f21340ff4f7a27c25e Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Wed, 8 Mar 2023 17:57:04 -0600 
Subject: [PATCH 12/12] Added try-except blocks to handle tiktoken import and
 modified token count functions to handle import failure. Also updated
 tiktoken requirement to only install for python version >= 3.8.

Co-authored-by: MindFlow
---
 mindflow/db/objects/model.py | 26 +++++++++++++++++++++-----
 mindflow/utils/token.py | 10 ++++++++--
 requirements.txt | 2 +-
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/mindflow/db/objects/model.py b/mindflow/db/objects/model.py
index f946698..926c6c9 100644
--- a/mindflow/db/objects/model.py
+++ b/mindflow/db/objects/model.py
@@ -4,7 +4,15 @@
 import numpy as np
 from traitlets import Callable
 
-import tiktoken
+try:
+    import tiktoken
+except ImportError:
+    print(
+        "tiktoken is not available for Python < 3.8. Estimation of tokens will be less precise, which may impact performance and quality of responses."
+    )
+    print("Upgrade to Python 3.8 or higher for better results.")
+    pass
+
 
 from mindflow.db.db.database import Collection
 from mindflow.db.objects.base import BaseObject
@@ -12,7 +20,7 @@
 from mindflow.db.objects.service import ServiceConfig
 from mindflow.db.objects.static_definition.model import ModelID
 from mindflow.db.objects.static_definition.service import ServiceID
-from mindflow.utils.errors import ModelError, EmbeddingModelError
+from mindflow.utils.errors import ModelError
 
 
 class Model(StaticObject):
@@ -47,7 +55,12 @@ class ConfiguredModel(Callable):
     name: str
     service: str
     model_type: str
-    tokenizer: tiktoken.Encoding
+
+    try:
+        tokenizer: tiktoken.Encoding
+    except NameError:
+        pass
+
     hard_token_limit: int
     token_cost: int
     token_cost_unit: str
@@ -69,8 +82,11 @@ def __init__(self, model_id: str):
         if value not in [None, ""]:
             setattr(self, key, value)
 
-        if self.service == ServiceID.OPENAI.value:
-            self.tokenizer = tiktoken.encoding_for_model(self.id)
+        try:
+            if self.service == ServiceID.OPENAI.value:
+                self.tokenizer = tiktoken.encoding_for_model(self.id)
+        except NameError:
+            pass
 
         service_config = ServiceConfig.load(f"{self.service}_config")
         self.api_key = service_config.api_key
diff --git a/mindflow/utils/token.py b/mindflow/utils/token.py
index 9c0aeb2..a7e27d1 100644
--- a/mindflow/utils/token.py
+++ b/mindflow/utils/token.py
@@ -6,11 +6,17 @@ def get_token_count(model: ConfiguredModel, text: str) -> int:
     """
     This function is used to get the token count of a string.
     """
-    return len(model.tokenizer.encode(text))
+    try:
+        return len(model.tokenizer.encode(text))
+    except Exception:
+        return len(text) // 3
 
 
 def get_batch_token_count(model: ConfiguredModel, texts: List[str]) -> int:
     """
     This function is used to get the token count of a list of strings.
     """
-    return sum([len(encoding) for encoding in model.tokenizer.encode_batch(texts)])
+    try:
+        return sum([len(encoding) for encoding in model.tokenizer.encode_batch(texts)])
+    except Exception:
+        return sum([len(text) // 3 for text in texts])
diff --git a/requirements.txt b/requirements.txt
index a9c97e6..16befaa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ alive-progress
 click
 numpy
 openai==0.27.0
-tiktoken==0.3.0
+tiktoken==0.3.0; python_version >= '3.8'
 pytest
 scikit-learn
 tqdm
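
For readers who want to exercise the fallback outside the repository, the sketch below illustrates the same idea in isolation: prefer tiktoken when it imports cleanly (Python 3.8+), otherwise assume roughly three characters per token, mirroring the len(text) // 3 estimate in mindflow/utils/token.py. It is a minimal sketch only; the helper names and the gpt-3.5-turbo default are illustrative assumptions, not part of the MindFlow codebase.

from typing import List

try:
    import tiktoken  # only installed on Python >= 3.8 per requirements.txt
except ImportError:
    tiktoken = None


def estimate_tokens(text: str, model_name: str = "gpt-3.5-turbo") -> int:
    """Exact count via tiktoken when available, else roughly 3 characters per token."""
    if tiktoken is not None:
        return len(tiktoken.encoding_for_model(model_name).encode(text))
    return len(text) // 3


def estimate_batch_tokens(texts: List[str], model_name: str = "gpt-3.5-turbo") -> int:
    """Sum of per-string token estimates for a batch."""
    return sum(estimate_tokens(text, model_name) for text in texts)


if __name__ == "__main__":
    # Prints an exact or estimated count depending on whether tiktoken imported.
    print(estimate_tokens("Hello, MindFlow!"))
    print(estimate_batch_tokens(["one", "two", "three"]))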