-
Notifications
You must be signed in to change notification settings - Fork 0
/
report_code_files.py
119 lines (100 loc) · 4.02 KB
/
report_code_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from argparse import ArgumentParser
import csv
import pandas as pd
from pathlib import Path
import sys
# Make the project-local "solver" package and the vendored navie-editor
# submodule importable when this script is run directly from the repo root.
sys.path.append(str(Path(__file__).parent / "solver"))
sys.path.append(str(Path(__file__).parent / "submodules" / "navie-editor"))
from solver.workflow.patch import Patch
# SWE-bench "verified" dataset; expected in the current working directory.
# Each row carries an "instance_id" and the gold "patch" diff text.
PARQUET_FILE = "SWE-bench_verified.parquet"
def _strip_source_prefix(path: str) -> str:
    """Strip everything up to and including the first "source" path segment.

    Paths recorded in search output are rooted inside a checkout under a
    "source" directory; the gold patch paths are repo-relative, so the
    prefix must be removed before comparing. Paths with no "source"
    segment are returned unchanged.
    """
    parts = path.split("/")
    if "source" not in parts:
        return path
    return "/".join(parts[parts.index("source") + 1 :])


def main(directory: str):
    """Compare searched code files against gold patch files, per instance.

    Recursively collects every ``search.output.txt`` under *directory*,
    normalizes the listed file paths, and writes ``code_files.csv`` with one
    row per instance in ``data/instance_sets/verified.txt`` reporting whether
    the found files match the gold patch's files from PARQUET_FILE.

    :param directory: root directory to search for ``search.output.txt`` files.
    """
    # Gold patches for every instance live in the parquet dataset.
    df = pd.read_parquet(PARQUET_FILE)

    # Collect {instance_id, newline-joined sorted file paths} per search output.
    code_files = []
    for search_output in Path(directory).rglob("search.output.txt"):
        with search_output.open(encoding="utf-8") as f:
            content = f.read()
        # Directory layout places the instance id four levels up from the file.
        instance_id = search_output.parent.parent.parent.parent.name
        print(instance_id)
        paths = sorted(_strip_source_prefix(line) for line in content.splitlines())
        code_files.append(
            {"instance_id": instance_id, "code_files": "\n".join(paths)}
        )

    # Both lists are sorted by instance_id so a single index walk aligns them.
    code_files_sorted = sorted(code_files, key=lambda x: x["instance_id"])

    with (Path("data") / "instance_sets" / "verified.txt").open(
        "r", encoding="utf-8"
    ) as f:
        verified_set = sorted(f.read().splitlines())

    report_file = Path("code_files.csv")
    # newline="" is required by the csv module to avoid blank rows on Windows.
    with report_file.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "instance_id",
                "code_files",
                "gold_code_files",
                "code_files_match",
                "code_files_extra",
                "code_files_missing",
            ],
        )
        writer.writeheader()
        code_file_index = 0
        for instance_id in verified_set:
            # Gold patch text for this instance (empty Series if absent).
            patch_str = df.loc[df["instance_id"] == instance_id, "patch"]
            if not len(patch_str):
                # BUG FIX: previously fell through to patch_str.iloc[0] and
                # raised IndexError; emit a bare row and move on instead.
                print(f"No patch found for instance_id: {instance_id}")
                writer.writerow({"instance_id": instance_id})
                continue

            patch = Patch(patch_str.iloc[0])
            gold_files = patch.list_files()
            print(f"Patch files: {gold_files}")

            if (
                code_file_index < len(code_files_sorted)
                and code_files_sorted[code_file_index]["instance_id"] == instance_id
            ):
                found_files = code_files_sorted[code_file_index][
                    "code_files"
                ].splitlines()
                # BUG FIX: found_files is sorted but the gold list's order is
                # whatever the patch yields; compare order-insensitively to
                # avoid false mismatches.
                code_files_match = sorted(gold_files) == sorted(found_files)
                gold_set = set(gold_files)
                found_set = set(found_files)
                # NOTE(review): "extra" counts gold-only files and "missing"
                # counts found-only files — the naming looks swapped relative
                # to the found set; preserved as-is, confirm intended meaning.
                writer.writerow(
                    {
                        "instance_id": instance_id,
                        "code_files": "\n".join(found_files),
                        "gold_code_files": "\n".join(gold_files),
                        "code_files_match": code_files_match,
                        "code_files_extra": len(gold_set - found_set),
                        "code_files_missing": len(found_set - gold_set),
                    }
                )
                code_file_index += 1
            else:
                # No search output collected for this instance.
                writer.writerow({"instance_id": instance_id})
if __name__ == "__main__":
    # CLI entry point: a single required flag naming the search root.
    cli = ArgumentParser()
    cli.add_argument(
        "--directory",
        required=True,
        type=str,
        help="Directory to search for code files",
    )
    options = cli.parse_args()
    main(directory=options.directory)