From ca234a38f500610dff324d6438b679c8bc05fbbd Mon Sep 17 00:00:00 2001 From: nollied Date: Wed, 8 Mar 2023 00:17:34 +0000 Subject: [PATCH 1/3] Added diff_parser.py to mindflow/utils directory with functions to parse git diffs. Co-authored-by: MindFlow --- mindflow/test.ipynb | 52 +++++++++++++++++ mindflow/unit_tests/dummy_diff.txt | 94 ++++++++++++++++++++++++++++++ mindflow/unit_tests/test_utils.py | 12 ++++ mindflow/utils/diff_parser.py | 39 +++++++++++++ 4 files changed, 197 insertions(+) create mode 100644 mindflow/test.ipynb create mode 100644 mindflow/unit_tests/dummy_diff.txt create mode 100644 mindflow/unit_tests/test_utils.py create mode 100644 mindflow/utils/diff_parser.py diff --git a/mindflow/test.ipynb b/mindflow/test.ipynb new file mode 100644 index 0000000..19d8f22 --- /dev/null +++ b/mindflow/test.ipynb @@ -0,0 +1,52 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "asdfasdf\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "asdfasdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mindflow/unit_tests/dummy_diff.txt b/mindflow/unit_tests/dummy_diff.txt new file mode 100644 index 0000000..e3afdd6 --- /dev/null +++ b/mindflow/unit_tests/dummy_diff.txt @@ -0,0 +1,94 @@ +diff --git a/diff.txt b/diff.txt +index de79139..e69de29 100644 +--- a/diff.txt ++++ b/diff.txt +@@ -1,38 +0,0 @@ +-diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py +-new file mode 100644 +-index 0000000..33a588f +---- /dev/null +-+++ b/mindflow/utils/diff_parser.py +-@@ -0,0 +1,31 @@ +-+ +-+ +-+ +-+def parse_git_diff_file(diff_file): +-+ diffs = {} +-+ current_file = None +-+ current_diff = [] +-+ +-+ with open(diff_file, "r") as f: +-+ for line in f: +-+ if line.startswith("diff --git"): +-+ # Starting a new file +-+ if current_file: +-+ # Add the previous diff to the dictionary +-+ diffs[current_file] = "".join(current_diff) +-+ current_file = line.split()[-1] +-+ current_diff = [line] +-+ else: +-+ current_diff.append(line) +-+ +-+ # Add the last diff to the dictionary +-+ if current_file: +-+ diffs[current_file] = "".join(current_diff) +-+ +-+ return diffs +-+ +-+ +-+diffs = parse_git_diff_file("diff.txt") +-+for filename, diff in diffs.items(): +-+ print(f"Diff for {filename}:") +-+ print(diff) +-\ No newline at end of file +diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py +index 33a588f..bfb9b92 100644 +--- a/mindflow/utils/diff_parser.py ++++ b/mindflow/utils/diff_parser.py +@@ -1,4 +1,8 @@ + ++import os ++ ++# NOTE: make sure to have a the "." in the file extension (if applicable) ++IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"] + + + def parse_git_diff_file(diff_file): +@@ -13,10 +17,21 @@ def parse_git_diff_file(diff_file): + if current_file: + # Add the previous diff to the dictionary + diffs[current_file] = "".join(current_diff) ++ + current_file = line.split()[-1] ++ current_ext = os.path.splitext(current_file)[1] ++ ++ if current_ext in IGNORE_FILE_EXTENSIONS: ++ # Ignore this file ++ current_file = None ++ current_diff = [] ++ continue ++ + current_diff = [line] + else: +- current_diff.append(line) ++ # skip lines if we are ignoring this file (TODO - this is a bit hacky) ++ if current_file: ++ current_diff.append(line) + + # Add the last diff to the dictionary + if current_file: +@@ -25,7 +40,9 @@ def parse_git_diff_file(diff_file): + return diffs + + ++ + diffs = parse_git_diff_file("diff.txt") +-for filename, diff in diffs.items(): +- print(f"Diff for {filename}:") +- print(diff) +\ No newline at end of file ++# for filename, diff in diffs.items(): ++# print(f"Diff for {filename}:") ++# print(diff) ++print(list(diffs.keys())) +\ No newline at end of file diff --git a/mindflow/unit_tests/test_utils.py b/mindflow/unit_tests/test_utils.py new file mode 100644 index 0000000..cacf274 --- /dev/null +++ b/mindflow/unit_tests/test_utils.py @@ -0,0 +1,12 @@ +from mindflow.utils.diff_parser import parse_git_diff_file + + +def test_diff_parser(): + diffs = parse_git_diff_file("mindflow/unit_tests/dummy_diff.txt") + + expected = { + "b/diff.txt": 'diff --git a/diff.txt b/diff.txt\nindex de79139..e69de29 100644\n--- a/diff.txt\n+++ b/diff.txt\n@@ -1,38 +0,0 @@\n-diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\n-new file mode 100644\n-index 0000000..33a588f\n---- /dev/null\n-+++ b/mindflow/utils/diff_parser.py\n-@@ -0,0 +1,31 @@\n-+\n-+\n-+\n-+def parse_git_diff_file(diff_file):\n-+ diffs = {}\n-+ current_file = None\n-+ current_diff = []\n-+\n-+ with open(diff_file, "r") as f:\n-+ for line in f:\n-+ if line.startswith("diff --git"):\n-+ # Starting a new file\n-+ if current_file:\n-+ # Add the previous diff to the dictionary\n-+ diffs[current_file] = "".join(current_diff)\n-+ current_file = line.split()[-1]\n-+ current_diff = [line]\n-+ else:\n-+ current_diff.append(line)\n-+\n-+ # Add the last diff to the dictionary\n-+ if current_file:\n-+ diffs[current_file] = "".join(current_diff)\n-+\n-+ return diffs\n-+\n-+\n-+diffs = parse_git_diff_file("diff.txt")\n-+for filename, diff in diffs.items():\n-+ print(f"Diff for {filename}:")\n-+ print(diff)\n-\\ No newline at end of file\n', + "b/mindflow/utils/diff_parser.py": 'diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\nindex 33a588f..bfb9b92 100644\n--- a/mindflow/utils/diff_parser.py\n+++ b/mindflow/utils/diff_parser.py\n@@ -1,4 +1,8 @@\n \n+import os\n+\n+# NOTE: make sure to have a the "." in the file extension (if applicable)\n+IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"]\n \n \n def parse_git_diff_file(diff_file):\n@@ -13,10 +17,21 @@ def parse_git_diff_file(diff_file):\n if current_file:\n # Add the previous diff to the dictionary\n diffs[current_file] = "".join(current_diff)\n+\n current_file = line.split()[-1]\n+ current_ext = os.path.splitext(current_file)[1]\n+\n+ if current_ext in IGNORE_FILE_EXTENSIONS:\n+ # Ignore this file\n+ current_file = None\n+ current_diff = []\n+ continue\n+\n current_diff = [line]\n else:\n- current_diff.append(line)\n+ # skip lines if we are ignoring this file (TODO - this is a bit hacky)\n+ if current_file:\n+ current_diff.append(line)\n \n # Add the last diff to the dictionary\n if current_file:\n@@ -25,7 +40,9 @@ def parse_git_diff_file(diff_file):\n return diffs\n \n \n+\n diffs = parse_git_diff_file("diff.txt")\n-for filename, diff in diffs.items():\n- print(f"Diff for {filename}:")\n- print(diff)\n\\ No newline at end of file\n+# for filename, diff in diffs.items():\n+# print(f"Diff for {filename}:")\n+# print(diff)\n+print(list(diffs.keys()))\n\\ No newline at end of file\n', + } + + assert diffs == expected diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py new file mode 100644 index 0000000..c9d0d27 --- /dev/null +++ b/mindflow/utils/diff_parser.py @@ -0,0 +1,39 @@ +import os + +# NOTE: make sure to have a the "." in the file extension (if applicable) +IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"] + + +def parse_git_diff_file(diff_file): + diffs = {} + current_file = None + current_diff = [] + + with open(diff_file, "r") as f: + for line in f: + if line.startswith("diff --git"): + # Starting a new file + if current_file: + # Add the previous diff to the dictionary + diffs[current_file] = "".join(current_diff) + + current_file = line.split()[-1] + current_ext = os.path.splitext(current_file)[1] + + if current_ext in IGNORE_FILE_EXTENSIONS: + # Ignore this file + current_file = None + current_diff = [] + continue + + current_diff = [line] + else: + # skip lines if we are ignoring this file (TODO - this is a bit hacky) + if current_file: + current_diff.append(line) + + # Add the last diff to the dictionary + if current_file: + diffs[current_file] = "".join(current_diff) + + return diffs From 7885b1254cfe883d55a347780fbe4b90fe248afd Mon Sep 17 00:00:00 2001 From: nollied Date: Wed, 8 Mar 2023 00:45:39 +0000 Subject: [PATCH 2/3] Modified diff parsing to exclude certain file extensions and added excluded file list to output. Co-authored-by: MindFlow --- mindflow/core/git/diff.py | 35 +-- mindflow/test.ipynb | 52 ---- mindflow/unit_tests/dummy_diff.txt | 403 +++++++++++++++++++++++------ mindflow/unit_tests/test_utils.py | 17 +- mindflow/utils/diff_parser.py | 61 ++--- 5 files changed, 386 insertions(+), 182 deletions(-) delete mode 100644 mindflow/test.ipynb diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py index 4931684..e71dfc2 100644 --- a/mindflow/core/git/diff.py +++ b/mindflow/core/git/diff.py @@ -13,6 +13,8 @@ from mindflow.utils.prompt_builders import build_context_prompt from mindflow.utils.prompts import GIT_DIFF_PROMPT_PREFIX +from mindflow.utils.diff_parser import parse_git_diff, IGNORE_FILE_EXTENSIONS + def run_diff(args: Tuple[str]) -> str: """ @@ -25,12 +27,16 @@ def run_diff(args: Tuple[str]) -> str: # Execute the git diff command and retrieve the output as a string diff_result = subprocess.check_output(command).decode("utf-8") - if diff_result.strip() == "": return "No staged changes." + diff_dict, excluded_filenames = parse_git_diff(diff_result) + + if len(diff_dict) <= 0: + return "No staged changes." + batched_parsed_diff_result = batch_git_diffs( - parse_git_diff(diff_result), token_limit=completion_model.hard_token_limit + diff_dict, token_limit=completion_model.hard_token_limit ) response: str = "" @@ -58,37 +64,22 @@ def run_diff(args: Tuple[str]) -> str: for future in concurrent.futures.as_completed(futures): response += future.result() + if len(excluded_filenames) > 0: + response += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}" + return response import re -def parse_git_diff(diff_output: str) -> List[Tuple[str, str]]: - file_diffs: List[Dict[str, List[str]]] = [] - current_diff: Optional[Dict[str, List[str]]] = None - for line in diff_output.split("\n"): - if line.startswith("diff --git"): - if current_diff is not None: - file_diffs.append(current_diff) - current_diff = {"file_name": None, "content": []} # type: ignore - match = re.match(r"^diff --git a/(.+?) b/.+?$", line) - if match: - current_diff["file_name"] = match.group(1) # type: ignore - if current_diff is not None: - current_diff["content"].append(line) - if current_diff is not None: - file_diffs.append(current_diff) - return [(diff["file_name"], "\n".join(diff["content"])) for diff in file_diffs] # type: ignore - - def batch_git_diffs( - file_diffs: List[Tuple[str, str]], token_limit: int + file_diffs: List[Dict[str, str]], token_limit: int ) -> List[List[Tuple[str, str]]]: batches = [] current_batch: List = [] current_batch_size = 0 - for file_name, diff_content in file_diffs: + for file_name, diff_content in file_diffs.items(): if len(diff_content) > token_limit: chunks = [ diff_content[i : i + token_limit] diff --git a/mindflow/test.ipynb b/mindflow/test.ipynb deleted file mode 100644 index 19d8f22..0000000 --- a/mindflow/test.ipynb +++ /dev/null @@ -1,52 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "asdfasdf\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "asdfasdf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/mindflow/unit_tests/dummy_diff.txt b/mindflow/unit_tests/dummy_diff.txt index e3afdd6..a07226e 100644 --- a/mindflow/unit_tests/dummy_diff.txt +++ b/mindflow/unit_tests/dummy_diff.txt @@ -1,94 +1,347 @@ -diff --git a/diff.txt b/diff.txt -index de79139..e69de29 100644 ---- a/diff.txt -+++ b/diff.txt -@@ -1,38 +0,0 @@ +diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py +index 4931684..e8e414b 100644 +--- a/mindflow/core/git/diff.py ++++ b/mindflow/core/git/diff.py +@@ -13,6 +13,8 @@ from mindflow.settings import Settings + from mindflow.utils.prompt_builders import build_context_prompt + from mindflow.utils.prompts import GIT_DIFF_PROMPT_PREFIX + ++from mindflow.utils.diff_parser import parse_git_diff, IGNORE_FILE_EXTENSIONS ++ + + def run_diff(args: Tuple[str]) -> str: + """ +@@ -25,12 +27,17 @@ def run_diff(args: Tuple[str]) -> str: + + # Execute the git diff command and retrieve the output as a string + diff_result = subprocess.check_output(command).decode("utf-8") +- + if diff_result.strip() == "": + return "No staged changes." + ++ ++ diff_dict, excluded_filenames = parse_git_diff(diff_result) ++ ++ if len(diff_dict) <= 0: ++ return "No staged changes." ++ + batched_parsed_diff_result = batch_git_diffs( +- parse_git_diff(diff_result), token_limit=completion_model.hard_token_limit ++ diff_dict, token_limit=completion_model.hard_token_limit + ) + + response: str = "" +@@ -58,37 +65,22 @@ def run_diff(args: Tuple[str]) -> str: + for future in concurrent.futures.as_completed(futures): + response += future.result() + ++ if len(excluded_filenames) > 0: ++ response += f"\n\nNOTE: The following files were excluded from the diff: {', '.join(excluded_filenames)}" ++ + return response + + + import re + + +-def parse_git_diff(diff_output: str) -> List[Tuple[str, str]]: +- file_diffs: List[Dict[str, List[str]]] = [] +- current_diff: Optional[Dict[str, List[str]]] = None +- for line in diff_output.split("\n"): +- if line.startswith("diff --git"): +- if current_diff is not None: +- file_diffs.append(current_diff) +- current_diff = {"file_name": None, "content": []} # type: ignore +- match = re.match(r"^diff --git a/(.+?) b/.+?$", line) +- if match: +- current_diff["file_name"] = match.group(1) # type: ignore +- if current_diff is not None: +- current_diff["content"].append(line) +- if current_diff is not None: +- file_diffs.append(current_diff) +- return [(diff["file_name"], "\n".join(diff["content"])) for diff in file_diffs] # type: ignore +- +- + def batch_git_diffs( +- file_diffs: List[Tuple[str, str]], token_limit: int ++ file_diffs: List[Dict[str, str]], token_limit: int + ) -> List[List[Tuple[str, str]]]: + batches = [] + current_batch: List = [] + current_batch_size = 0 +- for file_name, diff_content in file_diffs: ++ for file_name, diff_content in file_diffs.items(): + if len(diff_content) > token_limit: + chunks = [ + diff_content[i : i + token_limit] +diff --git a/mindflow/test.ipynb b/mindflow/test.ipynb +deleted file mode 100644 +index 19d8f22..0000000 +--- a/mindflow/test.ipynb ++++ /dev/null +@@ -1,52 +0,0 @@ +-{ +- "cells": [ +- { +- "cell_type": "code", +- "execution_count": null, +- "metadata": {}, +- "outputs": [], +- "source": [ +- "asdfasdf\n", +- "\n" +- ] +- }, +- { +- "cell_type": "code", +- "execution_count": null, +- "metadata": {}, +- "outputs": [], +- "source": [] +- }, +- { +- "cell_type": "code", +- "execution_count": null, +- "metadata": {}, +- "outputs": [], +- "source": [] +- }, +- { +- "cell_type": "code", +- "execution_count": null, +- "metadata": {}, +- "outputs": [], +- "source": [ +- "asdfasdf" +- ] +- }, +- { +- "cell_type": "code", +- "execution_count": null, +- "metadata": {}, +- "outputs": [], +- "source": [] +- } +- ], +- "metadata": { +- "language_info": { +- "name": "python" +- }, +- "orig_nbformat": 4 +- }, +- "nbformat": 4, +- "nbformat_minor": 2 +-} +diff --git a/mindflow/unit_tests/dummy_diff.txt b/mindflow/unit_tests/dummy_diff.txt +index e3afdd6..e69de29 100644 +--- a/mindflow/unit_tests/dummy_diff.txt ++++ b/mindflow/unit_tests/dummy_diff.txt +@@ -1,94 +0,0 @@ +-diff --git a/diff.txt b/diff.txt +-index de79139..e69de29 100644 +---- a/diff.txt +-+++ b/diff.txt +-@@ -1,38 +0,0 @@ +--diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py +--new file mode 100644 +--index 0000000..33a588f +----- /dev/null +--+++ b/mindflow/utils/diff_parser.py +--@@ -0,0 +1,31 @@ +--+ +--+ +--+ +--+def parse_git_diff_file(diff_file): +--+ diffs = {} +--+ current_file = None +--+ current_diff = [] +--+ +--+ with open(diff_file, "r") as f: +--+ for line in f: +--+ if line.startswith("diff --git"): +--+ # Starting a new file +--+ if current_file: +--+ # Add the previous diff to the dictionary +--+ diffs[current_file] = "".join(current_diff) +--+ current_file = line.split()[-1] +--+ current_diff = [line] +--+ else: +--+ current_diff.append(line) +--+ +--+ # Add the last diff to the dictionary +--+ if current_file: +--+ diffs[current_file] = "".join(current_diff) +--+ +--+ return diffs +--+ +--+ +--+diffs = parse_git_diff_file("diff.txt") +--+for filename, diff in diffs.items(): +--+ print(f"Diff for {filename}:") +--+ print(diff) +--\ No newline at end of file -diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py --new file mode 100644 --index 0000000..33a588f ----- /dev/null +-index 33a588f..bfb9b92 100644 +---- a/mindflow/utils/diff_parser.py -+++ b/mindflow/utils/diff_parser.py --@@ -0,0 +1,31 @@ +-@@ -1,4 +1,8 @@ +- +-+import os -+ +-+# NOTE: make sure to have a the "." in the file extension (if applicable) +-+IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"] +- +- +- def parse_git_diff_file(diff_file): +-@@ -13,10 +17,21 @@ def parse_git_diff_file(diff_file): +- if current_file: +- # Add the previous diff to the dictionary +- diffs[current_file] = "".join(current_diff) -+ +- current_file = line.split()[-1] +-+ current_ext = os.path.splitext(current_file)[1] -+ --+def parse_git_diff_file(diff_file): --+ diffs = {} --+ current_file = None --+ current_diff = [] +-+ if current_ext in IGNORE_FILE_EXTENSIONS: +-+ # Ignore this file +-+ current_file = None +-+ current_diff = [] +-+ continue -+ --+ with open(diff_file, "r") as f: --+ for line in f: --+ if line.startswith("diff --git"): --+ # Starting a new file +- current_diff = [line] +- else: +-- current_diff.append(line) +-+ # skip lines if we are ignoring this file (TODO - this is a bit hacky) -+ if current_file: --+ # Add the previous diff to the dictionary --+ diffs[current_file] = "".join(current_diff) --+ current_file = line.split()[-1] --+ current_diff = [line] --+ else: --+ current_diff.append(line) --+ --+ # Add the last diff to the dictionary --+ if current_file: --+ diffs[current_file] = "".join(current_diff) --+ --+ return diffs --+ +-+ current_diff.append(line) +- +- # Add the last diff to the dictionary +- if current_file: +-@@ -25,7 +40,9 @@ def parse_git_diff_file(diff_file): +- return diffs +- +- -+ --+diffs = parse_git_diff_file("diff.txt") --+for filename, diff in diffs.items(): --+ print(f"Diff for {filename}:") --+ print(diff) +- diffs = parse_git_diff_file("diff.txt") +--for filename, diff in diffs.items(): +-- print(f"Diff for {filename}:") +-- print(diff) -\ No newline at end of file +-+# for filename, diff in diffs.items(): +-+# print(f"Diff for {filename}:") +-+# print(diff) +-+print(list(diffs.keys())) +-\ No newline at end of file +diff --git a/mindflow/unit_tests/test_utils.py b/mindflow/unit_tests/test_utils.py +index cacf274..4f1b9fc 100644 +--- a/mindflow/unit_tests/test_utils.py ++++ b/mindflow/unit_tests/test_utils.py +@@ -1,8 +1,14 @@ +-from mindflow.utils.diff_parser import parse_git_diff_file ++from mindflow.utils.diff_parser import parse_git_diff + + + def test_diff_parser(): +- diffs = parse_git_diff_file("mindflow/unit_tests/dummy_diff.txt") ++ ++ diff = open("mindflow/unit_tests/dummy_diff.txt", "r").read() ++ diffs, excluded_files = parse_git_diff(diff) ++ ++ assert excluded_files == "b/" ++ ++ print(list(diffs.keys())) + + expected = { + "b/diff.txt": 'diff --git a/diff.txt b/diff.txt\nindex de79139..e69de29 100644\n--- a/diff.txt\n+++ b/diff.txt\n@@ -1,38 +0,0 @@\n-diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\n-new file mode 100644\n-index 0000000..33a588f\n---- /dev/null\n-+++ b/mindflow/utils/diff_parser.py\n-@@ -0,0 +1,31 @@\n-+\n-+\n-+\n-+def parse_git_diff_file(diff_file):\n-+ diffs = {}\n-+ current_file = None\n-+ current_diff = []\n-+\n-+ with open(diff_file, "r") as f:\n-+ for line in f:\n-+ if line.startswith("diff --git"):\n-+ # Starting a new file\n-+ if current_file:\n-+ # Add the previous diff to the dictionary\n-+ diffs[current_file] = "".join(current_diff)\n-+ current_file = line.split()[-1]\n-+ current_diff = [line]\n-+ else:\n-+ current_diff.append(line)\n-+\n-+ # Add the last diff to the dictionary\n-+ if current_file:\n-+ diffs[current_file] = "".join(current_diff)\n-+\n-+ return diffs\n-+\n-+\n-+diffs = parse_git_diff_file("diff.txt")\n-+for filename, diff in diffs.items():\n-+ print(f"Diff for {filename}:")\n-+ print(diff)\n-\\ No newline at end of file\n', diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py -index 33a588f..bfb9b92 100644 +index c9d0d27..1d7b094 100644 --- a/mindflow/utils/diff_parser.py +++ b/mindflow/utils/diff_parser.py -@@ -1,4 +1,8 @@ +@@ -4,36 +4,59 @@ import os + IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"] -+import os -+ -+# NOTE: make sure to have a the "." in the file extension (if applicable) -+IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"] +-def parse_git_diff_file(diff_file): ++def parse_git_diff(diff_str: str): + diffs = {} + current_file = None + current_diff = [] - def parse_git_diff_file(diff_file): -@@ -13,10 +17,21 @@ def parse_git_diff_file(diff_file): - if current_file: - # Add the previous diff to the dictionary - diffs[current_file] = "".join(current_diff) +- with open(diff_file, "r") as f: +- for line in f: +- if line.startswith("diff --git"): +- # Starting a new file +- if current_file: +- # Add the previous diff to the dictionary +- diffs[current_file] = "".join(current_diff) +- +- current_file = line.split()[-1] +- current_ext = os.path.splitext(current_file)[1] +- +- if current_ext in IGNORE_FILE_EXTENSIONS: +- # Ignore this file +- current_file = None +- current_diff = [] +- continue +- +- current_diff = [line] +- else: +- # skip lines if we are ignoring this file (TODO - this is a bit hacky) +- if current_file: +- current_diff.append(line) +- +- # Add the last diff to the dictionary +- if current_file: +- diffs[current_file] = "".join(current_diff) +- +- return diffs ++ excluded_files = [] + - current_file = line.split()[-1] -+ current_ext = os.path.splitext(current_file)[1] ++ for line in diff_str.splitlines(keepends=True): ++ if line.startswith("diff --git"): ++ # Starting a new file ++ if current_file: ++ # Add the previous diff to the dictionary ++ diffs[current_file] = "".join(current_diff) + -+ if current_ext in IGNORE_FILE_EXTENSIONS: -+ # Ignore this file -+ current_file = None -+ current_diff = [] -+ continue ++ current_file = line.split()[-1] ++ current_ext = os.path.splitext(current_file)[1] + - current_diff = [line] - else: -- current_diff.append(line) -+ # skip lines if we are ignoring this file (TODO - this is a bit hacky) -+ if current_file: -+ current_diff.append(line) - - # Add the last diff to the dictionary - if current_file: -@@ -25,7 +40,9 @@ def parse_git_diff_file(diff_file): - return diffs - - ++ if current_ext in IGNORE_FILE_EXTENSIONS: ++ excluded_files.append(current_file) ++ ++ # Ignore this file ++ current_file = None ++ current_diff = [] ++ continue ++ ++ current_diff = [line] ++ else: ++ # skip lines if we are ignoring this file (TODO - this is a bit hacky) ++ if current_file: ++ current_diff.append(line) ++ ++ # Add the last diff to the dictionary ++ if current_file: ++ diffs[current_file] = "".join(current_diff) ++ ++ return diffs, excluded_files ++ ++ ++# Old implementation: ++# def parse_git_diff(diff_output: str) -> List[Tuple[str, str]]: ++# file_diffs: List[Dict[str, List[str]]] = [] ++# current_diff: Optional[Dict[str, List[str]]] = None ++# for line in diff_output.split("\n"): ++# if line.startswith("diff --git"): ++# if current_diff is not None: ++# file_diffs.append(current_diff) ++# current_diff = {"file_name": None, "content": []} # type: ignore ++# match = re.match(r"^diff --git a/(.+?) b/.+?$", line) ++# if match: ++# current_diff["file_name"] = match.group(1) # type: ignore ++# if current_diff is not None: ++# current_diff["content"].append(line) ++# if current_diff is not None: ++# file_diffs.append(current_diff) ++# return [(diff["file_name"], "\n".join(diff["content"])) for diff in file_diffs] # type: ignore + - diffs = parse_git_diff_file("diff.txt") --for filename, diff in diffs.items(): -- print(f"Diff for {filename}:") -- print(diff) -\ No newline at end of file -+# for filename, diff in diffs.items(): -+# print(f"Diff for {filename}:") -+# print(diff) -+print(list(diffs.keys())) -\ No newline at end of file diff --git a/mindflow/unit_tests/test_utils.py b/mindflow/unit_tests/test_utils.py index cacf274..2d1de0f 100644 --- a/mindflow/unit_tests/test_utils.py +++ b/mindflow/unit_tests/test_utils.py @@ -1,12 +1,21 @@ -from mindflow.utils.diff_parser import parse_git_diff_file +from mindflow.utils.diff_parser import parse_git_diff def test_diff_parser(): - diffs = parse_git_diff_file("mindflow/unit_tests/dummy_diff.txt") + diff = open("mindflow/unit_tests/dummy_diff.txt", "r").read() + diffs, excluded_files = parse_git_diff(diff) + + assert excluded_files == ["b/mindflow/test.ipynb"] + + print(list(diffs.keys())) + print(excluded_files) + # print(diffs) expected = { - "b/diff.txt": 'diff --git a/diff.txt b/diff.txt\nindex de79139..e69de29 100644\n--- a/diff.txt\n+++ b/diff.txt\n@@ -1,38 +0,0 @@\n-diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\n-new file mode 100644\n-index 0000000..33a588f\n---- /dev/null\n-+++ b/mindflow/utils/diff_parser.py\n-@@ -0,0 +1,31 @@\n-+\n-+\n-+\n-+def parse_git_diff_file(diff_file):\n-+ diffs = {}\n-+ current_file = None\n-+ current_diff = []\n-+\n-+ with open(diff_file, "r") as f:\n-+ for line in f:\n-+ if line.startswith("diff --git"):\n-+ # Starting a new file\n-+ if current_file:\n-+ # Add the previous diff to the dictionary\n-+ diffs[current_file] = "".join(current_diff)\n-+ current_file = line.split()[-1]\n-+ current_diff = [line]\n-+ else:\n-+ current_diff.append(line)\n-+\n-+ # Add the last diff to the dictionary\n-+ if current_file:\n-+ diffs[current_file] = "".join(current_diff)\n-+\n-+ return diffs\n-+\n-+\n-+diffs = parse_git_diff_file("diff.txt")\n-+for filename, diff in diffs.items():\n-+ print(f"Diff for {filename}:")\n-+ print(diff)\n-\\ No newline at end of file\n', - "b/mindflow/utils/diff_parser.py": 'diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\nindex 33a588f..bfb9b92 100644\n--- a/mindflow/utils/diff_parser.py\n+++ b/mindflow/utils/diff_parser.py\n@@ -1,4 +1,8 @@\n \n+import os\n+\n+# NOTE: make sure to have a the "." in the file extension (if applicable)\n+IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"]\n \n \n def parse_git_diff_file(diff_file):\n@@ -13,10 +17,21 @@ def parse_git_diff_file(diff_file):\n if current_file:\n # Add the previous diff to the dictionary\n diffs[current_file] = "".join(current_diff)\n+\n current_file = line.split()[-1]\n+ current_ext = os.path.splitext(current_file)[1]\n+\n+ if current_ext in IGNORE_FILE_EXTENSIONS:\n+ # Ignore this file\n+ current_file = None\n+ current_diff = []\n+ continue\n+\n current_diff = [line]\n else:\n- current_diff.append(line)\n+ # skip lines if we are ignoring this file (TODO - this is a bit hacky)\n+ if current_file:\n+ current_diff.append(line)\n \n # Add the last diff to the dictionary\n if current_file:\n@@ -25,7 +40,9 @@ def parse_git_diff_file(diff_file):\n return diffs\n \n \n+\n diffs = parse_git_diff_file("diff.txt")\n-for filename, diff in diffs.items():\n- print(f"Diff for {filename}:")\n- print(diff)\n\\ No newline at end of file\n+# for filename, diff in diffs.items():\n+# print(f"Diff for {filename}:")\n+# print(diff)\n+print(list(diffs.keys()))\n\\ No newline at end of file\n', + "b/mindflow/core/git/diff.py": 'diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py\nindex 4931684..e8e414b 100644\n--- a/mindflow/core/git/diff.py\n+++ b/mindflow/core/git/diff.py\n@@ -13,6 +13,8 @@ from mindflow.settings import Settings\n from mindflow.utils.prompt_builders import build_context_prompt\n from mindflow.utils.prompts import GIT_DIFF_PROMPT_PREFIX\n \n+from mindflow.utils.diff_parser import parse_git_diff, IGNORE_FILE_EXTENSIONS\n+\n \n def run_diff(args: Tuple[str]) -> str:\n """\n@@ -25,12 +27,17 @@ def run_diff(args: Tuple[str]) -> str:\n \n # Execute the git diff command and retrieve the output as a string\n diff_result = subprocess.check_output(command).decode("utf-8")\n-\n if diff_result.strip() == "":\n return "No staged changes."\n \n+ \n+ diff_dict, excluded_filenames = parse_git_diff(diff_result)\n+\n+ if len(diff_dict) <= 0:\n+ return "No staged changes."\n+\n batched_parsed_diff_result = batch_git_diffs(\n- parse_git_diff(diff_result), token_limit=completion_model.hard_token_limit\n+ diff_dict, token_limit=completion_model.hard_token_limit\n )\n \n response: str = ""\n@@ -58,37 +65,22 @@ def run_diff(args: Tuple[str]) -> str:\n for future in concurrent.futures.as_completed(futures):\n response += future.result()\n \n+ if len(excluded_filenames) > 0:\n+ response += f"\\n\\nNOTE: The following files were excluded from the diff: {\', \'.join(excluded_filenames)}"\n+\n return response\n \n \n import re\n \n \n-def parse_git_diff(diff_output: str) -> List[Tuple[str, str]]:\n- file_diffs: List[Dict[str, List[str]]] = []\n- current_diff: Optional[Dict[str, List[str]]] = None\n- for line in diff_output.split("\\n"):\n- if line.startswith("diff --git"):\n- if current_diff is not None:\n- file_diffs.append(current_diff)\n- current_diff = {"file_name": None, "content": []} # type: ignore\n- match = re.match(r"^diff --git a/(.+?) b/.+?$", line)\n- if match:\n- current_diff["file_name"] = match.group(1) # type: ignore\n- if current_diff is not None:\n- current_diff["content"].append(line)\n- if current_diff is not None:\n- file_diffs.append(current_diff)\n- return [(diff["file_name"], "\\n".join(diff["content"])) for diff in file_diffs] # type: ignore\n-\n-\n def batch_git_diffs(\n- file_diffs: List[Tuple[str, str]], token_limit: int\n+ file_diffs: List[Dict[str, str]], token_limit: int\n ) -> List[List[Tuple[str, str]]]:\n batches = []\n current_batch: List = []\n current_batch_size = 0\n- for file_name, diff_content in file_diffs:\n+ for file_name, diff_content in file_diffs.items():\n if len(diff_content) > token_limit:\n chunks = [\n diff_content[i : i + token_limit]\n', + "b/mindflow/unit_tests/dummy_diff.txt": 'diff --git a/mindflow/unit_tests/dummy_diff.txt b/mindflow/unit_tests/dummy_diff.txt\nindex e3afdd6..e69de29 100644\n--- a/mindflow/unit_tests/dummy_diff.txt\n+++ b/mindflow/unit_tests/dummy_diff.txt\n@@ -1,94 +0,0 @@\n-diff --git a/diff.txt b/diff.txt\n-index de79139..e69de29 100644\n---- a/diff.txt\n-+++ b/diff.txt\n-@@ -1,38 +0,0 @@\n--diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\n--new file mode 100644\n--index 0000000..33a588f\n----- /dev/null\n--+++ b/mindflow/utils/diff_parser.py\n--@@ -0,0 +1,31 @@\n--+\n--+\n--+\n--+def parse_git_diff_file(diff_file):\n--+ diffs = {}\n--+ current_file = None\n--+ current_diff = []\n--+\n--+ with open(diff_file, "r") as f:\n--+ for line in f:\n--+ if line.startswith("diff --git"):\n--+ # Starting a new file\n--+ if current_file:\n--+ # Add the previous diff to the dictionary\n--+ diffs[current_file] = "".join(current_diff)\n--+ current_file = line.split()[-1]\n--+ current_diff = [line]\n--+ else:\n--+ current_diff.append(line)\n--+\n--+ # Add the last diff to the dictionary\n--+ if current_file:\n--+ diffs[current_file] = "".join(current_diff)\n--+\n--+ return diffs\n--+\n--+\n--+diffs = parse_git_diff_file("diff.txt")\n--+for filename, diff in diffs.items():\n--+ print(f"Diff for {filename}:")\n--+ print(diff)\n--\\ No newline at end of file\n-diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\n-index 33a588f..bfb9b92 100644\n---- a/mindflow/utils/diff_parser.py\n-+++ b/mindflow/utils/diff_parser.py\n-@@ -1,4 +1,8 @@\n- \n-+import os\n-+\n-+# NOTE: make sure to have a the "." in the file extension (if applicable)\n-+IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"]\n- \n- \n- def parse_git_diff_file(diff_file):\n-@@ -13,10 +17,21 @@ def parse_git_diff_file(diff_file):\n- if current_file:\n- # Add the previous diff to the dictionary\n- diffs[current_file] = "".join(current_diff)\n-+\n- current_file = line.split()[-1]\n-+ current_ext = os.path.splitext(current_file)[1]\n-+\n-+ if current_ext in IGNORE_FILE_EXTENSIONS:\n-+ # Ignore this file\n-+ current_file = None\n-+ current_diff = []\n-+ continue\n-+\n- current_diff = [line]\n- else:\n-- current_diff.append(line)\n-+ # skip lines if we are ignoring this file (TODO - this is a bit hacky)\n-+ if current_file:\n-+ current_diff.append(line)\n- \n- # Add the last diff to the dictionary\n- if current_file:\n-@@ -25,7 +40,9 @@ def parse_git_diff_file(diff_file):\n- return diffs\n- \n- \n-+\n- diffs = parse_git_diff_file("diff.txt")\n--for filename, diff in diffs.items():\n-- print(f"Diff for {filename}:")\n-- print(diff)\n-\\ No newline at end of file\n-+# for filename, diff in diffs.items():\n-+# print(f"Diff for {filename}:")\n-+# print(diff)\n-+print(list(diffs.keys()))\n-\\ No newline at end of file\n', + "b/mindflow/unit_tests/test_utils.py": 'diff --git a/mindflow/unit_tests/test_utils.py b/mindflow/unit_tests/test_utils.py\nindex cacf274..4f1b9fc 100644\n--- a/mindflow/unit_tests/test_utils.py\n+++ b/mindflow/unit_tests/test_utils.py\n@@ -1,8 +1,14 @@\n-from mindflow.utils.diff_parser import parse_git_diff_file\n+from mindflow.utils.diff_parser import parse_git_diff\n \n \n def test_diff_parser():\n- diffs = parse_git_diff_file("mindflow/unit_tests/dummy_diff.txt")\n+\n+ diff = open("mindflow/unit_tests/dummy_diff.txt", "r").read()\n+ diffs, excluded_files = parse_git_diff(diff)\n+\n+ assert excluded_files == "b/"\n+\n+ print(list(diffs.keys()))\n \n expected = {\n "b/diff.txt": \'diff --git a/diff.txt b/diff.txt\\nindex de79139..e69de29 100644\\n--- a/diff.txt\\n+++ b/diff.txt\\n@@ -1,38 +0,0 @@\\n-diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\\n-new file mode 100644\\n-index 0000000..33a588f\\n---- /dev/null\\n-+++ b/mindflow/utils/diff_parser.py\\n-@@ -0,0 +1,31 @@\\n-+\\n-+\\n-+\\n-+def parse_git_diff_file(diff_file):\\n-+ diffs = {}\\n-+ current_file = None\\n-+ current_diff = []\\n-+\\n-+ with open(diff_file, "r") as f:\\n-+ for line in f:\\n-+ if line.startswith("diff --git"):\\n-+ # Starting a new file\\n-+ if current_file:\\n-+ # Add the previous diff to the dictionary\\n-+ diffs[current_file] = "".join(current_diff)\\n-+ current_file = line.split()[-1]\\n-+ current_diff = [line]\\n-+ else:\\n-+ current_diff.append(line)\\n-+\\n-+ # Add the last diff to the dictionary\\n-+ if current_file:\\n-+ diffs[current_file] = "".join(current_diff)\\n-+\\n-+ return diffs\\n-+\\n-+\\n-+diffs = parse_git_diff_file("diff.txt")\\n-+for filename, diff in diffs.items():\\n-+ print(f"Diff for {filename}:")\\n-+ print(diff)\\n-\\\\ No newline at end of file\\n\',\n', + "b/mindflow/utils/diff_parser.py": 'diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py\nindex c9d0d27..1d7b094 100644\n--- a/mindflow/utils/diff_parser.py\n+++ b/mindflow/utils/diff_parser.py\n@@ -4,36 +4,59 @@ import os\n IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"]\n \n \n-def parse_git_diff_file(diff_file):\n+def parse_git_diff(diff_str: str):\n diffs = {}\n current_file = None\n current_diff = []\n \n- with open(diff_file, "r") as f:\n- for line in f:\n- if line.startswith("diff --git"):\n- # Starting a new file\n- if current_file:\n- # Add the previous diff to the dictionary\n- diffs[current_file] = "".join(current_diff)\n-\n- current_file = line.split()[-1]\n- current_ext = os.path.splitext(current_file)[1]\n-\n- if current_ext in IGNORE_FILE_EXTENSIONS:\n- # Ignore this file\n- current_file = None\n- current_diff = []\n- continue\n-\n- current_diff = [line]\n- else:\n- # skip lines if we are ignoring this file (TODO - this is a bit hacky)\n- if current_file:\n- current_diff.append(line)\n-\n- # Add the last diff to the dictionary\n- if current_file:\n- diffs[current_file] = "".join(current_diff)\n-\n- return diffs\n+ excluded_files = []\n+\n+ for line in diff_str.splitlines(keepends=True):\n+ if line.startswith("diff --git"):\n+ # Starting a new file\n+ if current_file:\n+ # Add the previous diff to the dictionary\n+ diffs[current_file] = "".join(current_diff)\n+\n+ current_file = line.split()[-1]\n+ current_ext = os.path.splitext(current_file)[1]\n+\n+ if current_ext in IGNORE_FILE_EXTENSIONS:\n+ excluded_files.append(current_file)\n+\n+ # Ignore this file\n+ current_file = None\n+ current_diff = []\n+ continue\n+\n+ current_diff = [line]\n+ else:\n+ # skip lines if we are ignoring this file (TODO - this is a bit hacky)\n+ if current_file:\n+ current_diff.append(line)\n+\n+ # Add the last diff to the dictionary\n+ if current_file:\n+ diffs[current_file] = "".join(current_diff)\n+\n+ return diffs, excluded_files\n+\n+\n+# Old implementation:\n+# def parse_git_diff(diff_output: str) -> List[Tuple[str, str]]:\n+# file_diffs: List[Dict[str, List[str]]] = []\n+# current_diff: Optional[Dict[str, List[str]]] = None\n+# for line in diff_output.split("\\n"):\n+# if line.startswith("diff --git"):\n+# if current_diff is not None:\n+# file_diffs.append(current_diff)\n+# current_diff = {"file_name": None, "content": []} # type: ignore\n+# match = re.match(r"^diff --git a/(.+?) b/.+?$", line)\n+# if match:\n+# current_diff["file_name"] = match.group(1) # type: ignore\n+# if current_diff is not None:\n+# current_diff["content"].append(line)\n+# if current_diff is not None:\n+# file_diffs.append(current_diff)\n+# return [(diff["file_name"], "\\n".join(diff["content"])) for diff in file_diffs] # type: ignore\n+\n', } assert diffs == expected diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py index c9d0d27..44739f6 100644 --- a/mindflow/utils/diff_parser.py +++ b/mindflow/utils/diff_parser.py @@ -4,36 +4,39 @@ IGNORE_FILE_EXTENSIONS = [".pyc", ".ipynb", ".ipynb_checkpoints"] -def parse_git_diff_file(diff_file): +def parse_git_diff(diff_str: str): diffs = {} current_file = None current_diff = [] - with open(diff_file, "r") as f: - for line in f: - if line.startswith("diff --git"): - # Starting a new file - if current_file: - # Add the previous diff to the dictionary - diffs[current_file] = "".join(current_diff) - - current_file = line.split()[-1] - current_ext = os.path.splitext(current_file)[1] - - if current_ext in IGNORE_FILE_EXTENSIONS: - # Ignore this file - current_file = None - current_diff = [] - continue - - current_diff = [line] - else: - # skip lines if we are ignoring this file (TODO - this is a bit hacky) - if current_file: - current_diff.append(line) - - # Add the last diff to the dictionary - if current_file: - diffs[current_file] = "".join(current_diff) - - return diffs + excluded_files = [] + + for line in diff_str.splitlines(keepends=True): + if line.startswith("diff --git"): + # Starting a new file + if current_file: + # Add the previous diff to the dictionary + diffs[current_file] = "".join(current_diff) + + current_file = line.split()[-1] + current_ext = os.path.splitext(current_file)[1] + + if current_ext in IGNORE_FILE_EXTENSIONS: + excluded_files.append(current_file) + + # Ignore this file + current_file = None + current_diff = [] + continue + + current_diff = [line] + else: + # skip lines if we are ignoring this file (TODO - this is a bit hacky) + if current_file: + current_diff.append(line) + + # Add the last diff to the dictionary + if current_file: + diffs[current_file] = "".join(current_diff) + + return diffs, excluded_files From 34f4c11a7ec6954f111a080a2b5ef9d03c2797e2 Mon Sep 17 00:00:00 2001 From: nollied Date: Wed, 8 Mar 2023 00:49:37 +0000 Subject: [PATCH 3/3] mypy --- mindflow/core/git/diff.py | 2 +- mindflow/utils/diff_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mindflow/core/git/diff.py b/mindflow/core/git/diff.py index e71dfc2..eddaa41 100644 --- a/mindflow/core/git/diff.py +++ b/mindflow/core/git/diff.py @@ -74,7 +74,7 @@ def run_diff(args: Tuple[str]) -> str: def batch_git_diffs( - file_diffs: List[Dict[str, str]], token_limit: int + file_diffs: Dict[str, str], token_limit: int ) -> List[List[Tuple[str, str]]]: batches = [] current_batch: List = [] diff --git a/mindflow/utils/diff_parser.py b/mindflow/utils/diff_parser.py index 44739f6..03636e8 100644 --- a/mindflow/utils/diff_parser.py +++ b/mindflow/utils/diff_parser.py @@ -7,7 +7,7 @@ def parse_git_diff(diff_str: str): diffs = {} current_file = None - current_diff = [] + current_diff = [] # type: ignore excluded_files = []