Merge pull request #95 from nollied/cleanup
Added new modules and methods, updated version number, and added new dependencies and imports.
steegecs authored Mar 9, 2023
2 parents 25d77e0 + aa1c3c3 commit db2a0c3
Showing 24 changed files with 315 additions and 111 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -78,8 +78,8 @@ Make some changes to your git repo and stage them. Then, run `mf commit`! You sh
1 file changed, 14 insertions(+)
```

### Create PRs With GPT Titles And Body
Make some changes to your branch and stage, and then commit them. Then, run `mf pr`! A PR should be created with a title and body generated by GPT, and a link to the PR should be printed to the console.
### Create PRs/MRs With GPT Titles And Body
Make some changes to your branch and stage, and then commit them. Then, run `mf pr` for GitHub or `mf mr` for GitLab! A pull request/merge request should be created with a title and body generated by GPT, and a link to the PR should be printed to the console.
- To use this feature, you must first install and authenticate the [GitHub CLI](https://cli.github.com/).

## How does it work?
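As a quick illustration of the updated README flow, a typical session might look like the snippet below. The branch name and file edits are placeholders; `mf commit` and `mf pr` are the commands documented above, and GitLab users would substitute `mf mr`.

```
git checkout -b my-feature
# ...edit some files...
git add .
mf commit        # GPT-generated commit message
mf pr            # GPT-generated PR title and body (GitLab: mf mr)
```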
2 changes: 1 addition & 1 deletion mindflow/__init__.py
@@ -1 +1 @@
__version__ = "0.3.12"
__version__ = "0.3.13"
20 changes: 18 additions & 2 deletions mindflow/core/chat.py
@@ -1,19 +1,35 @@
from typing import Optional
from mindflow.settings import Settings
from mindflow.utils.constants import MinimumReservedLength
from mindflow.utils.prompts import CHAT_PROMPT_PREFIX
from mindflow.utils.token import get_token_count


def run_chat(prompt: str) -> str:
"""
Build a chat prompt from the user's input and send it to the GPT model.
"""
settings = Settings()
completion_model = settings.mindflow_models.query.model

if (
get_token_count(completion_model, CHAT_PROMPT_PREFIX + prompt)
> completion_model.hard_token_limit - MinimumReservedLength.CHAT.value
):
print("The prompt is too long. Please try again with a shorter prompt.")
return ""

# Prompt GPT through Mindflow API or locally
response: str = settings.mindflow_models.query.model(
response: Optional[str] = completion_model(
[
{
"role": "system",
"content": "You are a helpful virtual assistant responding to a users query using your general knowledge and the text provided below.",
"content": CHAT_PROMPT_PREFIX,
},
{"role": "user", "content": prompt},
]
)

if response is None:
return "Unable to generate response. Please try again. If the problem persists, please raise an issue at: https://github.com/nollied/mindflow-cli/issues."
return response
7 changes: 5 additions & 2 deletions mindflow/core/git/commit.py
@@ -1,8 +1,9 @@
import subprocess
from typing import Tuple, Optional
from typing import Tuple, Optional, Union

from mindflow.core.git.diff import run_diff
from mindflow.settings import Settings
from mindflow.utils.errors import ModelError
from mindflow.utils.prompt_builders import build_context_prompt
from mindflow.utils.prompts import COMMIT_PROMPT_PREFIX

@@ -20,9 +21,11 @@ def run_commit(args: Tuple[str], message_overwrite: Optional[str] = None) -> str
if diff_output == "No staged changes.":
return diff_output

response: str = settings.mindflow_models.query.model(
response: Union[ModelError, str] = settings.mindflow_models.query.model(
build_context_prompt(COMMIT_PROMPT_PREFIX, diff_output)
)
if isinstance(response, ModelError):
return response.commit_message

# add co-authorship to commit message
response += "\n\nCo-authored-by: MindFlow <[email protected]>"
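The commit.py hunk above is the first of several call sites switching to a `Union[ModelError, str]` return convention: the model call returns either a string or a `ModelError`, and the caller branches on `isinstance`. A minimal, self-contained sketch of that pattern follows; the `ModelError` fields and the `call_model` helper are stand-ins inferred from this diff, not the actual `mindflow.utils.errors` or model code.

```python
from dataclasses import dataclass
from typing import Union


@dataclass
class ModelError:
    """Hypothetical stand-in for mindflow.utils.errors.ModelError."""
    detail: str

    @property
    def commit_message(self) -> str:
        # User-facing message for the `mf commit` path.
        return f"Unable to generate a commit message: {self.detail}"


def call_model(prompt: str) -> Union[ModelError, str]:
    # Stand-in for settings.mindflow_models.query.model(...)
    if not prompt:
        return ModelError("empty prompt")
    return "Refactor diff batching and error handling"


def run_commit_sketch(diff_output: str) -> str:
    response = call_model(diff_output)
    if isinstance(response, ModelError):
        return response.commit_message
    return response + "\n\nCo-authored-by: MindFlow <[email protected]>"
```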
81 changes: 57 additions & 24 deletions mindflow/core/git/diff.py
@@ -3,17 +3,19 @@
"""
import concurrent.futures
import subprocess
from typing import Dict
from typing import Dict, Union
from typing import List
from typing import Optional
from typing import Tuple

from mindflow.db.objects.model import ConfiguredModel
from mindflow.settings import Settings
from mindflow.utils.constants import MinimumReservedLength
from mindflow.utils.errors import ModelError
from mindflow.utils.prompt_builders import build_context_prompt
from mindflow.utils.prompts import GIT_DIFF_PROMPT_PREFIX

from mindflow.utils.diff_parser import parse_git_diff, IGNORE_FILE_EXTENSIONS
from mindflow.utils.diff_parser import parse_git_diff
from mindflow.utils.token import get_token_count


def run_diff(args: Tuple[str]) -> str:
@@ -35,18 +37,19 @@ def run_diff(args: Tuple[str]) -> str:
if len(diff_dict) <= 0:
return "No staged changes."

batched_parsed_diff_result = batch_git_diffs(
diff_dict, token_limit=completion_model.hard_token_limit
)
batched_parsed_diff_result = batch_git_diffs(diff_dict, completion_model)

response: str = ""
diff_summary: str = ""
if len(batched_parsed_diff_result) == 1:
content = ""
for file_name, diff_content in batched_parsed_diff_result[0]:
content += f"*{file_name}*\n DIFF CONTENT: {diff_content}\n\n"
response = completion_model(
diff_response: Union[ModelError, str] = completion_model(
build_context_prompt(GIT_DIFF_PROMPT_PREFIX, content)
)
if isinstance(diff_response, ModelError):
return diff_response.diff_message
diff_summary += diff_response
else:
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
@@ -62,43 +65,73 @@ def run_diff(args: Tuple[str]) -> str:

# Process the results as they become available
for future in concurrent.futures.as_completed(futures):
response += future.result()
diff_partial_response: Union[ModelError, str] = future.result()
if isinstance(diff_partial_response, ModelError):
return diff_partial_response.diff_partial_message

diff_summary += diff_partial_response

if len(excluded_filenames) > 0:
response += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}"
diff_summary += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}"

return response
return diff_summary


import re


def batch_git_diffs(
file_diffs: Dict[str, str], token_limit: int
file_diffs: Dict[str, str], model: ConfiguredModel
) -> List[List[Tuple[str, str]]]:
batches = []
current_batch: List = []
current_batch_size = 0
current_batch_text = ""
for file_name, diff_content in file_diffs.items():
if len(diff_content) > token_limit:
chunks = [
diff_content[i : i + token_limit]
for i in range(0, len(diff_content), token_limit)
]
if (
get_token_count(model, diff_content)
> model.hard_token_limit - MinimumReservedLength.DIFF.value
):
## Split the diff into chunks that are less than the token limit
chunks = [diff_content]
while True:
new_chunks = []
for chunk in chunks:
if (
get_token_count(model, chunk)
> model.hard_token_limit - MinimumReservedLength.DIFF.value
):
half_len = len(chunk) // 2
left_half = chunk[:half_len]
right_half = chunk[half_len:]
new_chunks.extend([left_half, right_half])
else:
new_chunks.append(chunk)
if new_chunks == chunks:
break
chunks = new_chunks

## Add the chunks to the batch or multiple batches
for chunk in chunks:
if current_batch_size + len(chunk) > token_limit * 2:
if (
get_token_count(model, current_batch_text + chunk)
> model.hard_token_limit - MinimumReservedLength.DIFF.value
):
batches.append(current_batch)
current_batch = []
current_batch_size = 0
current_batch_text = ""
current_batch.append((file_name, chunk))
current_batch_size += len(chunk)
elif current_batch_size + len(diff_content) > token_limit * 2:
current_batch_text += chunk

elif (
get_token_count(model, current_batch_text + diff_content)
> model.hard_token_limit - MinimumReservedLength.DIFF.value
):
batches.append(current_batch)
current_batch = [(file_name, diff_content)]
current_batch_size = len(diff_content)
current_batch_text = diff_content
else:
current_batch.append((file_name, diff_content))
current_batch_size += len(diff_content)
current_batch_text += diff_content
if current_batch:
batches.append(current_batch)
return batches
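To summarize the new `batch_git_diffs` strategy in isolation: any file diff whose token count exceeds the budget is repeatedly halved until every piece fits, and the pieces are then packed greedily into batches that stay under the budget. The sketch below uses a rough 4-characters-per-token estimate in place of the model-aware `get_token_count`, so the numbers are illustrative only.

```python
from typing import List


def estimate_tokens(text: str) -> int:
    # Rough stand-in for get_token_count(model, text).
    return len(text) // 4


def halve_until_fits(text: str, budget: int) -> List[str]:
    # Keep splitting oversized chunks in half until every chunk fits the budget.
    chunks = [text]
    while True:
        new_chunks: List[str] = []
        for chunk in chunks:
            if estimate_tokens(chunk) > budget:
                mid = len(chunk) // 2
                new_chunks.extend([chunk[:mid], chunk[mid:]])
            else:
                new_chunks.append(chunk)
        if new_chunks == chunks:
            return chunks
        chunks = new_chunks


def pack_into_batches(diffs: List[str], budget: int) -> List[List[str]]:
    # Greedily pack chunks into batches whose combined size stays under the budget.
    batches: List[List[str]] = []
    current: List[str] = []
    current_text = ""
    for diff in diffs:
        for chunk in halve_until_fits(diff, budget):
            if current and estimate_tokens(current_text + chunk) > budget:
                batches.append(current)
                current, current_text = [], ""
            current.append(chunk)
            current_text += chunk
    if current:
        batches.append(current)
    return batches
```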
7 changes: 6 additions & 1 deletion mindflow/core/git/mr.py
@@ -32,7 +32,12 @@ def run_mr(
return

if not title or not description:
title, description = create_title_and_body(base_branch, title, description)
title_description_tuple = create_title_and_body(base_branch, title, description)

if not title_description_tuple:
return

title, description = title_description_tuple

create_merge_request(args, title, description)

30 changes: 23 additions & 7 deletions mindflow/core/git/pr.py
@@ -1,10 +1,11 @@
import concurrent.futures
import subprocess
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Union

from mindflow.core.git.diff import run_diff
from mindflow.settings import Settings
from mindflow.utils.command_parse import get_flag_value
from mindflow.utils.errors import ModelError
from mindflow.utils.prompt_builders import build_context_prompt
from mindflow.utils.prompts import PR_BODY_PREFIX
from mindflow.utils.prompts import PR_TITLE_PREFIX
@@ -35,8 +36,12 @@ def run_pr(args: Tuple[str], title: Optional[str] = None, body: Optional[str] =
return

if not title or not body:
title, body = create_title_and_body(base_branch, title, body)
title_body_tuple = create_title_and_body(base_branch, title, body)

if not title_body_tuple:
return

title, body = title_body_tuple
create_pull_request(args, title, body)


@@ -58,11 +63,13 @@ def is_valid_pr(head_branch: str, base_branch: str) -> bool:

def create_title_and_body(
base_branch, title: Optional[str], body: Optional[str]
) -> Tuple[str, str]:
) -> Optional[Tuple[str, str]]:
settings = Settings()

diff_output = run_diff((base_branch,))

title_response: Union[ModelError, str]
body_response: Union[ModelError, str]
if title is None and body is None:
pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output)
pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output)
@@ -75,16 +82,25 @@ def create_title_and_body(
settings.mindflow_models.query.model, pr_body_prompt
)

title = future_title.result()
body = future_body.result()
title_response = future_title.result()
body_response = future_body.result()
else:
if title is None:
pr_title_prompt = build_context_prompt(PR_TITLE_PREFIX, diff_output)
title = settings.mindflow_models.query.model(pr_title_prompt)
title_response = settings.mindflow_models.query.model(pr_title_prompt)
if body is None:
pr_body_prompt = build_context_prompt(PR_BODY_PREFIX, diff_output)
body = settings.mindflow_models.query.model(pr_body_prompt)
body_response = settings.mindflow_models.query.model(pr_body_prompt)

if isinstance(title_response, ModelError):
print(title_response.pr_message)
return None
if isinstance(body_response, ModelError):
print(body_response.pr_message)
return None

title = title if title is not None else title_response
body = body if body is not None else body_response
return title, body


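For readability, here is the concurrent portion of `create_title_and_body` reduced to a standalone sketch: when neither a title nor a body was supplied, both prompts are submitted to a thread pool and the results are collected before the `ModelError` checks shown above. `call_model` and the prompt strings are placeholders for `settings.mindflow_models.query.model` and `build_context_prompt(...)`.

```python
import concurrent.futures
from typing import Tuple


def call_model(prompt: str) -> str:
    # Placeholder for settings.mindflow_models.query.model(prompt).
    return f"model output for: {prompt.splitlines()[0]}"


def generate_title_and_body(diff_output: str) -> Tuple[str, str]:
    title_prompt = "PR TITLE PREFIX\n" + diff_output  # stands in for build_context_prompt(PR_TITLE_PREFIX, diff_output)
    body_prompt = "PR BODY PREFIX\n" + diff_output    # stands in for build_context_prompt(PR_BODY_PREFIX, diff_output)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_title = executor.submit(call_model, title_prompt)
        future_body = executor.submit(call_model, body_prompt)
        return future_title.result(), future_body.result()
```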
38 changes: 22 additions & 16 deletions mindflow/core/index.py
@@ -1,10 +1,10 @@
"""
`generate` command
"""
from asyncio import Future
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from typing import List
import logging
from typing import List, Union
from typing import Optional

import numpy as np
@@ -18,8 +18,10 @@
from mindflow.resolving.resolve import resolve_all
from mindflow.resolving.resolve import return_if_indexable
from mindflow.settings import Settings
from mindflow.utils.errors import ModelError
from mindflow.utils.prompt_builders import build_context_prompt
from mindflow.utils.prompts import INDEX_PROMPT_PREFIX
from mindflow.utils.token import get_batch_token_count, get_token_count


def run_index(document_paths: List[str], refresh: bool, force: bool) -> None:
@@ -97,9 +99,14 @@ def __init__(
self.start = start
self.end = end
if text:
self.summary = completion_model(
response: Union[str, ModelError] = completion_model(
build_context_prompt(INDEX_PROMPT_PREFIX, text)
)
if isinstance(response, ModelError):
self.summary = ""
print(response.index_message)
else:
self.summary = response

def set_leaves(self, leaves: List["Node"]) -> None:
self.leaves = leaves
@@ -136,15 +143,6 @@ def iterative_to_dict(self) -> dict:
return node_dict


def count_tokens(text: str) -> int:
"""
Counts/estimates the number of tokens this text will consume by GPT.
"""
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# count = len(tokenizer(text)['input_ids'])
return len(text) // 4 # Token Estimation for speed


# This function is used to split a string into chunks of a specified token limit using binary search
def binary_split_raw_text_to_nodes(
completion_model: ConfiguredModel, text: str
@@ -156,7 +154,10 @@ def binary_split_raw_text_to_nodes(
stack = [(0, len(text))]
while stack:
start, end = stack.pop()
if count_tokens(text[start:end]) < completion_model.soft_token_limit:
if (
get_token_count(completion_model, text[start:end])
< completion_model.soft_token_limit
):
nodes.append(Node(completion_model, start, end, text[start:end]))
else:
mid = ((end - start) // 2) + start
@@ -176,7 +177,9 @@ def binary_split_nodes_to_chunks(
while stack:
nodes, start, end = stack.pop()
if (
sum(count_tokens(node.summary) for node in nodes[start:end])
get_batch_token_count(
completion_model, [node.summary for node in nodes[start:end]]
)
< completion_model.soft_token_limit
):
chunks.append(nodes[start:end])
@@ -195,7 +198,10 @@ def create_nodes(completion_model: ConfiguredModel, leaf_nodes: List[Node]) -> N
while stack:
leaf_nodes, start, end = stack.pop()
if (
sum(count_tokens(leaf_node.summary) for leaf_node in leaf_nodes[start:end])
get_batch_token_count(
completion_model,
[leaf_node.summary for leaf_node in leaf_nodes[start:end]],
)
> completion_model.soft_token_limit
):
node_chunks: List[List[Node]] = binary_split_nodes_to_chunks(
@@ -222,7 +228,7 @@ def create_text_search_tree(completion_model: ConfiguredModel, text: str) -> dic
"""
This function is used to create a tree of responses from the OpenAI API
"""
if count_tokens(text) < completion_model.soft_token_limit:
if get_token_count(completion_model, text) < completion_model.soft_token_limit:
return Node(completion_model, 0, len(text), text).to_dict()

return create_nodes(
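The index changes replace the old `count_tokens` length heuristic with `get_token_count` and `get_batch_token_count` from `mindflow.utils.token`, which this diff does not show. A plausible sketch of such helpers, using the `tiktoken` tokenizer, is given below; this is an assumption about the implementation, not the actual mindflow code, and it takes a model name string where the real helpers take a `ConfiguredModel`.

```python
from typing import List

import tiktoken


def get_token_count_sketch(model_name: str, text: str) -> int:
    # Fall back to a general-purpose encoding if the model name is unknown.
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def get_batch_token_count_sketch(model_name: str, texts: List[str]) -> int:
    # Sum of per-text token counts; sufficient for budget checks like the ones above.
    return sum(get_token_count_sketch(model_name, text) for text in texts)
```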