From 05da890bdf564dff5baf232b36ad62244ef2b38b Mon Sep 17 00:00:00 2001
From: Rinat Zaripov <zaripov.rinat@gmail.com>
Date: Sun, 21 Jan 2024 19:18:29 +1100
Subject: [PATCH] feat: evaluate refactor outcomes using LLM to make decision
 of whether file edit should be accepted or discarded (#14)

---
 .changeset/witty-singers-sort.md              |  56 ++
 .gitignore                                    |   6 +-
 .refactor-bot/benchmarks/test-benchmark.yaml  |  22 +
 .../refactors/test-likely-failure/goal.md     |   4 +-
 README.md                                     |  25 +-
 package.json                                  |   1 +
 .../refactor-bot/src/benchmark/benchmark.ts   | 102 +++
 .../src/benchmark/benchmarkConfig.ts          |  61 ++
 packages/refactor-bot/src/benchmark/cli.ts    |  80 ++
 .../refactor-bot/src/benchmark/cliHandler.ts  |  12 +
 .../src/benchmark/example-config.yaml         |  22 +
 .../src/benchmark/loadBenchmarkConfig.ts      |  23 +
 .../src/benchmark/loadRefactorResult.ts       |  36 +
 .../src/benchmark/refactorResultSchema.ts     |  36 +
 .../src/benchmark/reportBenchmarkSummary.ts   |  68 ++
 .../refactor-bot/src/benchmark/runVariant.ts  | 204 +++++
 .../src/benchmark/summarizeCosts.ts           |  40 +
 .../src/benchmark/summarizeOutcomeScore.ts    |  56 ++
 .../src/benchmark/summarizePerformance.ts     |  21 +
 .../src/benchmark/summarizeRefactorResult.ts  |  49 ++
 packages/refactor-bot/src/bootstrap.ts        |  10 +-
 packages/refactor-bot/src/cache/cache.ts      |  50 +-
 packages/refactor-bot/src/cache/log.ts        |   9 +-
 .../src/cache/makeCachedFunction.ts           |   2 +
 .../src/cache/makeCachedObservable.test.ts    |   8 +-
 .../src/cache/makeCachedObservable.ts         |  21 +-
 .../refactor-bot/src/cache/persistence.ts     |  14 +-
 packages/refactor-bot/src/cache/state.ts      |  65 +-
 packages/refactor-bot/src/chat-gpt/api.ts     |  10 +-
 .../src/child-process/spawnResult.ts          |   5 +
 .../src/evaluate/evaluateFileChanges.ts       | 215 ++++++
 .../src/evaluate/evaluateFileScore.ts         |  85 +++
 .../src/evaluate/evaluateRefactorResult.ts    | 267 +++++++
 .../src/evaluate/evaluateUnchangedFile.ts     | 181 +++++
 .../src/evaluate/extractRequirements.ts       | 115 +++
 .../refactor-bot/src/evaluate/utils/avg.ts    |   5 +
 .../refactor-bot/src/evaluate/utils/sum.ts    |   3 +
 .../src/git/gitShowFileCommitSummary.ts       |  41 +
 .../refactor-bot/src/logger/asyncFormatter.ts |   2 +-
 .../refactor-bot/src/logger/formatters.ts     |  29 +
 .../src/package-manager/runCheckCommand.ts    |  25 +-
 packages/refactor-bot/src/playground.ts       |  69 +-
 .../src/prompt-formatters/formatBulletList.ts |  17 +
 .../formatFencedCodeBlock.ts                  |  26 +
 .../prompt-formatters/formatFileContents.ts   |  42 ++
 .../src/prompt-formatters/formatFileDiff.ts   |   8 +
 .../src/prompt-formatters/formatOptional.ts   |  12 +
 .../src/prompt-formatters/formatZodError.ts   |  19 +
 packages/refactor-bot/src/prompt/cli.ts       |   6 +
 packages/refactor-bot/src/prompt/run.ts       |   3 +-
 .../src/refactor/actions/acceptedEdit.ts      |   4 +-
 .../src/refactor/actions/discardedEdit.ts     |   4 +-
 packages/refactor-bot/src/refactor/check.ts   |   7 +-
 .../src/refactor/checkoutSandbox.ts           |   5 -
 packages/refactor-bot/src/refactor/cli.ts     |  21 +-
 .../src/refactor/collectLlmUsage.ts           | 145 ++++
 .../src/refactor/determineModelParameters.ts  |  32 +-
 .../refactor-bot/src/refactor/filesToEdit.ts  | 110 +++
 .../src/refactor/findRefactorStateLocation.ts |  44 ++
 .../src/refactor/planAndRefactor.ts           |  26 +-
 packages/refactor-bot/src/refactor/prompt.ts  | 133 +++-
 .../refactor-bot/src/refactor/refactor.ts     |  88 +--
 .../src/refactor/refactorBatch.ts             |  98 +--
 .../refactor-bot/src/refactor/refactorFile.ts | 710 ++++++++++--------
 .../refactor-bot/src/refactor/refactorGoal.ts |   1 +
 .../src/refactor/resetToLastAcceptedCommit.ts |   9 +-
 .../src/refactor/resultsCollector.ts          | 195 +++--
 .../src/refactor/retrieveParameters.ts        | 155 +---
 .../refactor-bot/src/refactor/runRefactor.ts  | 168 +++--
 packages/refactor-bot/src/refactor/types.ts   | 240 +++---
 .../parseFencedCodeBlocks.test.ts             | 142 ++++
 .../response-parsers/parseFencedCodeBlocks.ts |  60 ++
 .../parseJsonResponse.test.ts                 |  58 ++
 .../src/response-parsers/parseJsonResponse.ts |  52 ++
 packages/refactor-bot/src/utils/hasOne.ts     |   6 +-
 .../src/utils/isFileNotFoundError.ts          |   5 +
 .../refactor-bot/src/utils/parseJsonSchema.ts |  22 +
 packages/refactor-bot/tsconfig.json           |   2 +-
 78 files changed, 3872 insertions(+), 958 deletions(-)
 create mode 100644 .changeset/witty-singers-sort.md
 create mode 100644 .refactor-bot/benchmarks/test-benchmark.yaml
 create mode 100644 packages/refactor-bot/src/benchmark/benchmark.ts
 create mode 100644 packages/refactor-bot/src/benchmark/benchmarkConfig.ts
 create mode 100644 packages/refactor-bot/src/benchmark/cli.ts
 create mode 100644 packages/refactor-bot/src/benchmark/cliHandler.ts
 create mode 100644 packages/refactor-bot/src/benchmark/example-config.yaml
 create mode 100644 packages/refactor-bot/src/benchmark/loadBenchmarkConfig.ts
 create mode 100644 packages/refactor-bot/src/benchmark/loadRefactorResult.ts
 create mode 100644 packages/refactor-bot/src/benchmark/refactorResultSchema.ts
 create mode 100644 packages/refactor-bot/src/benchmark/reportBenchmarkSummary.ts
 create mode 100644 packages/refactor-bot/src/benchmark/runVariant.ts
 create mode 100644 packages/refactor-bot/src/benchmark/summarizeCosts.ts
 create mode 100644 packages/refactor-bot/src/benchmark/summarizeOutcomeScore.ts
 create mode 100644 packages/refactor-bot/src/benchmark/summarizePerformance.ts
 create mode 100644 packages/refactor-bot/src/benchmark/summarizeRefactorResult.ts
 create mode 100644 packages/refactor-bot/src/evaluate/evaluateFileChanges.ts
 create mode 100644 packages/refactor-bot/src/evaluate/evaluateFileScore.ts
 create mode 100644 packages/refactor-bot/src/evaluate/evaluateRefactorResult.ts
 create mode 100644 packages/refactor-bot/src/evaluate/evaluateUnchangedFile.ts
 create mode 100644 packages/refactor-bot/src/evaluate/extractRequirements.ts
 create mode 100644 packages/refactor-bot/src/evaluate/utils/avg.ts
 create mode 100644 packages/refactor-bot/src/evaluate/utils/sum.ts
 create mode 100644 packages/refactor-bot/src/git/gitShowFileCommitSummary.ts
 create mode 100644 packages/refactor-bot/src/prompt-formatters/formatBulletList.ts
 create mode 100644 packages/refactor-bot/src/prompt-formatters/formatFencedCodeBlock.ts
 create mode 100644 packages/refactor-bot/src/prompt-formatters/formatFileContents.ts
 create mode 100644 packages/refactor-bot/src/prompt-formatters/formatFileDiff.ts
 create mode 100644 packages/refactor-bot/src/prompt-formatters/formatOptional.ts
 create mode 100644 packages/refactor-bot/src/prompt-formatters/formatZodError.ts
 create mode 100644 packages/refactor-bot/src/refactor/collectLlmUsage.ts
 create mode 100644 packages/refactor-bot/src/refactor/filesToEdit.ts
 create mode 100644 packages/refactor-bot/src/refactor/findRefactorStateLocation.ts
 create mode 100644 packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.test.ts
 create mode 100644 packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.ts
 create mode 100644 packages/refactor-bot/src/response-parsers/parseJsonResponse.test.ts
 create mode 100644 packages/refactor-bot/src/response-parsers/parseJsonResponse.ts
 create mode 100644 packages/refactor-bot/src/utils/isFileNotFoundError.ts
 create mode 100644 packages/refactor-bot/src/utils/parseJsonSchema.ts

diff --git a/.changeset/witty-singers-sort.md b/.changeset/witty-singers-sort.md
new file mode 100644
index 0000000..f58b37a
--- /dev/null
+++ b/.changeset/witty-singers-sort.md
@@ -0,0 +1,56 @@
+---
+'refactor-bot': patch
+---
+
+feat: evaluate refactor outcomes using LLM to make decision of whether file edit
+should be accepted or discarded
+
+This is a big change which adds extra steps to the refactor process. Every time
+an LLM produces a file edit - we will pass that edit through evaluation
+algorithm to asses whether it should be accepted or discarded. Previously, this
+logic was only affected by the existence or absence of eslint errors. This will
+make the final result higher quality and more reliable.
+
+The new behavior can be disabled by setting `evaluate: false` in the `goal.md`
+file.
+
+In addition to that, this change also adds a new CLI command for internal use
+which allows us to compare results of multiple refactor runs. This is useful for
+benchmarking purposes.
+
+To run the benchmark, use the following command:
+
+```sh
+pnpm benchmark --config .refactor-bot/benchmarks/test-benchmark.yaml
+```
+
+Where the config:
+
+```yaml
+refactorConfig:
+    name: test-refactoring
+    ref: 8f1a3da55caeee3df75853042e57978c45513f18
+    budgetCents: 100
+    model: gpt-4-1106-preview
+    objective:
+        Replace all usages of `readFile` from `fs/promises` module with
+        `readFileSync` from `fs` module in
+        `packages/refactor-bot/src/refactor/planTasks.ts`,
+        `packages/refactor-bot/src/refactor/loadRefactors.ts` and
+        `packages/refactor-bot/src/refactor/discoverDependencies.ts`.
+
+numberOfRuns: 2
+
+variants:
+    - name: 'A'
+      ids: # ids of refactor runs to save mooney on
+          - VRixXEwC
+          - k0FmgQjU
+          - IpSOtP7d
+          - xqydSrSU
+    - name: 'B'
+```
+
+This will run multiple refactor runs and compare the results. At this moment no
+statistical analysis is performed as I'm not convinced we can reach statistical
+significance with the number of runs that also doesn't make you poor.
diff --git a/.gitignore b/.gitignore
index c270442..aed0928 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,9 @@
 node_modules
 .tsc-out
 src/test.ts
-.refactor-bot/refactors/*/state
 .DS_Store
-.refactor-bot/prompts/playground.md
 dist
+.refactor-bot/benchmarks/state
+.refactor-bot/refactors/*/state
+.refactor-bot/prompts/playground.md
+.refactor-bot/playground-cache
\ No newline at end of file
diff --git a/.refactor-bot/benchmarks/test-benchmark.yaml b/.refactor-bot/benchmarks/test-benchmark.yaml
new file mode 100644
index 0000000..1faa309
--- /dev/null
+++ b/.refactor-bot/benchmarks/test-benchmark.yaml
@@ -0,0 +1,22 @@
+refactorConfig:
+    name: test-refactoring
+    ref: 8f1a3da55caeee3df75853042e57978c45513f18
+    budgetCents: 100
+    model: gpt-4-1106-preview
+    objective:
+        Replace all usages of `readFile` from `fs/promises` module with
+        `readFileSync` from `fs` module in
+        `packages/refactor-bot/src/refactor/planTasks.ts`,
+        `packages/refactor-bot/src/refactor/loadRefactors.ts` and
+        `packages/refactor-bot/src/refactor/discoverDependencies.ts`.
+
+numberOfRuns: 2
+
+variants:
+    - name: 'A'
+      ids:
+          - VRixXEwC
+          - k0FmgQjU
+          - IpSOtP7d
+          - xqydSrSU
+    - name: 'B'
diff --git a/.refactor-bot/refactors/test-likely-failure/goal.md b/.refactor-bot/refactors/test-likely-failure/goal.md
index a8a3041..3c4d5fd 100644
--- a/.refactor-bot/refactors/test-likely-failure/goal.md
+++ b/.refactor-bot/refactors/test-likely-failure/goal.md
@@ -2,8 +2,8 @@
 # This is to test a likely failure during refactor
 ref: 8f1a3da55caeee3df75853042e57978c45513f18
 budgetCents: 100
-model: gpt-3.5-turbo
+model: gpt-3.5-turbo-1106
 ```
 
 Replace all usages of `readFile` from `fs/promises` module with `readFileSync`
-from `fs` module in `packages/refactor-bot/src/pipeline/dependencies.ts`.
+from `fs` module in `packages/refactor-bot/src/cache/dependencies.ts`.
diff --git a/README.md b/README.md
index 2fc8643..64e9666 100644
--- a/README.md
+++ b/README.md
@@ -80,20 +80,15 @@ Options:
 Performs a refactoring using Plan and Execute technique
 
 Options:
-  --help              Show help                                        [boolean]
-  --version           Show version number                              [boolean]
-  --name              Name of the refactoring to run                    [string]
-  --id                Unique id of the refactoring that was previously run but d
-                      idn't finish to start from last successful point  [string]
-  --save-to-cache     Whether to enable saving results to the cache, by default
-                      it's enabled                     [boolean] [default: true]
-  --enable-cache-for  Enable cache for specific steps only, can be useful if we
-                      want to disable cache for all other steps and replay them
-                                                                         [array]
-  --costs             Whether to print the total costs of OpenAI requests, by de
-                      fault it's disabled             [boolean] [default: false]
-  --performance       Whether to print performance metrics, by default it's disa
-                      bled                            [boolean] [default: false]
+  --help         Show help                                             [boolean]
+  --version      Show version number                                   [boolean]
+  --name         Name of the refactoring to run                         [string]
+  --id           Unique id of the refactoring that was previously run but didn't
+                  finish - use this to start from last successful step  [string]
+  --costs        Whether to print the total costs of OpenAI requests, by default
+                  it's disabled                       [boolean] [default: false]
+  --performance  Whether to print performance metrics, by default it's disabled
+                                                      [boolean] [default: false]
 ```
 
 At first it will create a file for you with description of the refactor. Open
@@ -183,7 +178,7 @@ further, but there is no way to measure how much we improved it. So the next
 step is to add a way to measure the quality of the refactor for benchmarking
 purposes.
 
--   [ ] ability to evaluate result of the refactor to benchmark the quality of
+-   [x] ability to evaluate result of the refactor to benchmark the quality of
         the refactor, so we can asses how different changes affect the quality
         and performance of the refactor-bot
 -   [ ] add Chain Of Thoughts to try to recover from failed attempts to fix
diff --git a/package.json b/package.json
index 0698a61..f54b5df 100644
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
     },
     "type": "module",
     "scripts": {
+        "benchmark": "pnpm tsx packages/refactor-bot/src/benchmark/cli.ts",
         "refactor-bot-bundled": "node ./packages/refactor-bot/dist/bin/refactor-bot.mjs"
     },
     "dependencies": {},
diff --git a/packages/refactor-bot/src/benchmark/benchmark.ts b/packages/refactor-bot/src/benchmark/benchmark.ts
new file mode 100644
index 0000000..2c826fe
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/benchmark.ts
@@ -0,0 +1,102 @@
+import orderBy from 'lodash-es/orderBy';
+import { join } from 'path';
+import { from, lastValueFrom, mergeMap, toArray } from 'rxjs';
+
+import { createCachedPipeline } from '../cache/state';
+import type { CacheStateRef } from '../cache/types';
+import { logger } from '../logger/logger';
+import { randomText } from '../utils/randomText';
+import { loadBenchmarkConfig } from './loadBenchmarkConfig';
+import { reportBenchmarkSummary } from './reportBenchmarkSummary';
+import { runVariant } from './runVariant';
+import { summarizeRefactorResult } from './summarizeRefactorResult';
+
+export async function benchmark(opts: {
+    config: string;
+    id?: string;
+    // for debugging:
+    saveToCache?: boolean;
+    enableCacheFor?: string[];
+    disableCacheFor?: string[];
+}) {
+    const config = await loadBenchmarkConfig(opts.config);
+
+    const id = opts.id ?? randomText(8);
+
+    const runVariantsAndCompare = async (
+        input: typeof config,
+        ctx?: CacheStateRef
+    ) => {
+        const variantAtATime = Math.max(
+            1,
+            input.maxConcurrentRefactors / input.variants.length
+        );
+        const maxConcurrentRefactorsPerVariant = Math.max(
+            1,
+            input.maxConcurrentRefactors / variantAtATime
+        );
+
+        logger.debug('Running refactors for multiple variants', {
+            maxConcurrentRefactors: input.maxConcurrentRefactors,
+            variantAtATime,
+            maxConcurrentRefactorsPerVariant,
+        });
+
+        const results = await lastValueFrom(
+            from(input.variants).pipe(
+                mergeMap(async (variant) => {
+                    const { resultFilePaths } = await runVariant(
+                        {
+                            id,
+                            variant,
+                            numberOfRuns: input.numberOfRuns,
+                            evaluationConfig: input.evaluationConfig,
+                            maxConcurrentRefactors:
+                                maxConcurrentRefactorsPerVariant,
+                            refactorConfig: {
+                                ...input.refactorConfig,
+                                ...config.refactorConfig,
+                            },
+                        },
+                        ctx
+                    );
+
+                    return {
+                        variant: variant.name,
+                        resultFilePaths,
+                    };
+                }, variantAtATime),
+                toArray()
+            )
+        );
+
+        logger.debug('Finished running refactors for multiple variants', {
+            results,
+        });
+
+        const summaries = await Promise.all(
+            results.map(async (result) => ({
+                variant: result.variant,
+                summary: await summarizeRefactorResult({
+                    resultFilePaths: result.resultFilePaths,
+                }),
+            }))
+        );
+
+        const orderedSummaries = orderBy(summaries, ['variant']);
+
+        await reportBenchmarkSummary({
+            summaries: orderedSummaries,
+        });
+    };
+
+    const { execute } = createCachedPipeline({
+        location: join(`.refactor-bot/benchmarks/state`, id),
+        saveToCache: opts.saveToCache ?? true,
+        enableCacheFor: opts.enableCacheFor,
+        disableCacheFor: opts.disableCacheFor,
+        pipeline: runVariantsAndCompare,
+    });
+
+    return await execute(config);
+}
diff --git a/packages/refactor-bot/src/benchmark/benchmarkConfig.ts b/packages/refactor-bot/src/benchmark/benchmarkConfig.ts
new file mode 100644
index 0000000..47d9b0a
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/benchmarkConfig.ts
@@ -0,0 +1,61 @@
+import { z } from 'zod';
+
+import { modelsSchema } from '../chat-gpt/api';
+import { ensureHasTwoElements } from '../utils/hasOne';
+
+export const passthroughRefactorConfigSchema = z
+    .object({
+        name: z.string(),
+        model: modelsSchema,
+        objective: z.string(),
+    })
+    .passthrough();
+
+const partialRefactorConfigSchema = z
+    .object({
+        model: modelsSchema.optional(),
+        objective: z.string().optional(),
+    })
+    .passthrough();
+
+export const appVariantSchema = z.object({
+    name: z.string().regex(/^[a-z0-9-]+$/i),
+    ref: z.string().optional(),
+    repository: z.string().optional(),
+    ids: z.array(z.string()).optional(),
+    command: z
+        .array(z.string())
+        .nonempty()
+        .default(['pnpm', 'refactor-bot', 'refactor']),
+    refactorConfig: partialRefactorConfigSchema.optional(),
+});
+
+export const evaluationConfigSchema = z.object({
+    model: modelsSchema,
+    choices: z.number().default(3),
+});
+
+export const benchConfigSchema = z
+    .object({
+        variants: z
+            .array(appVariantSchema)
+            .transform((variants) => ensureHasTwoElements(variants)),
+        refactorConfig: passthroughRefactorConfigSchema,
+        evaluationConfig: evaluationConfigSchema.default({
+            model: 'gpt-4-1106-preview',
+            choices: 3,
+        }),
+        numberOfRuns: z.number().default(1),
+        maxConcurrentRefactors: z.number().default(4),
+    })
+    .transform((input, ctx) => {
+        const variants = new Set(input.variants.map((variant) => variant.name));
+        if (variants.size !== input.variants.length) {
+            ctx.addIssue({
+                code: 'custom',
+                message: 'Variants must have unique names',
+            });
+            return z.NEVER;
+        }
+        return input;
+    });
diff --git a/packages/refactor-bot/src/benchmark/cli.ts b/packages/refactor-bot/src/benchmark/cli.ts
new file mode 100644
index 0000000..b10bae3
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/cli.ts
@@ -0,0 +1,80 @@
+import type { ArgumentsCamelCase, Argv, CommandModule } from 'yargs';
+import yargs from 'yargs';
+
+import { line } from '../text/line';
+
+const builder = (yargs: Argv) =>
+    yargs
+        .option('id', {
+            type: 'string',
+            describe: line`
+                Unique id of the benchmark run to identify cache directory
+            `,
+        })
+        .option('config', {
+            type: 'string',
+            describe: line`
+                Path to the config yaml file containing benchmark configuration
+            `,
+            demandOption: true,
+        })
+        .option('save-to-cache', {
+            type: 'boolean',
+            describe: line`
+                Whether to enable saving results to the cache, by default
+                it's enabled.
+            `,
+            default: true,
+            hidden: true,
+        })
+        .option('enable-cache-for', {
+            type: 'string',
+            array: true,
+            describe: line`
+                Disable cache for specific steps - you can specify the name
+                of the step or a name followed by a hash of the cache entry.
+                This for debugging purposes only.
+            `,
+            hidden: true,
+        })
+        .option('disable-cache-for', {
+            type: 'string',
+            array: true,
+            describe: line`
+                Disable cache for specific steps - you can specify the name
+                of the step or a name followed by a hash of the cache entry.
+                This for debugging purposes only.
+            `,
+            hidden: true,
+        });
+
+type Args = {
+    config: string;
+};
+
+const benchmarkCommand = {
+    command: 'benchmark',
+    describe: line`
+        Performs refactoring using different versions of the refactor bot then
+        evaluates the results and compares them.
+    `,
+    builder,
+    handler: async (opts: ArgumentsCamelCase<Args>) => {
+        await import('dotenv').then((m) =>
+            m.config({
+                override: true,
+            })
+        );
+        const { cliHandler } = await import('./cliHandler');
+        await cliHandler(opts);
+    },
+} satisfies CommandModule<Record<never, never>, Args>;
+
+const opts = await builder(yargs(process.argv.slice(2)))
+    .usage(benchmarkCommand.describe)
+    .parseAsync();
+
+await benchmarkCommand.handler(opts).catch((err) => {
+    console.error(err);
+    process.exitCode = 1;
+});
diff --git a/packages/refactor-bot/src/benchmark/cliHandler.ts b/packages/refactor-bot/src/benchmark/cliHandler.ts
new file mode 100644
index 0000000..c5a0a92
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/cliHandler.ts
@@ -0,0 +1,12 @@
+import { benchmark } from './benchmark';
+
+export async function cliHandler(opts: {
+    config: string;
+    id?: string;
+    // for debugging:
+    saveToCache?: boolean;
+    enableCacheFor?: string[];
+    disableCacheFor?: string[];
+}) {
+    await benchmark(opts);
+}
diff --git a/packages/refactor-bot/src/benchmark/example-config.yaml b/packages/refactor-bot/src/benchmark/example-config.yaml
new file mode 100644
index 0000000..afb87c0
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/example-config.yaml
@@ -0,0 +1,22 @@
+refactorConfig:
+    name: 'refactor-this-or-that'
+    objective: 'Refactor this or that'
+    model: 'gpt-4-1106-preview'
+
+evaluationConfig:
+    model: 'gpt-4-1106-preview'
+    choices: 3
+
+numberOfRuns: 3
+
+variants:
+    - name: 'A'
+      ref: 'HEAD'
+
+    - name: 'B'
+      ref: 'HEAD'
+      command:
+          - pnpm
+          - refactor-bot
+          - refactor
+          - --name refactor-this-or-that
diff --git a/packages/refactor-bot/src/benchmark/loadBenchmarkConfig.ts b/packages/refactor-bot/src/benchmark/loadBenchmarkConfig.ts
new file mode 100644
index 0000000..43483a7
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/loadBenchmarkConfig.ts
@@ -0,0 +1,23 @@
+import { readFile } from 'fs/promises';
+import { load } from 'js-yaml';
+
+import { ConfigurationError } from '../errors/configurationError';
+import { isFileNotFoundError } from '../utils/isFileNotFoundError';
+import { benchConfigSchema } from './benchmarkConfig';
+
+export async function loadBenchmarkConfig(filePath: string) {
+    try {
+        const text = await readFile(filePath, 'utf-8');
+        const data = load(text);
+        const parsed = benchConfigSchema.parse(data);
+        return parsed;
+    } catch (err) {
+        if (isFileNotFoundError(err)) {
+            throw new ConfigurationError(`Config file not found`, {
+                cause: err,
+            });
+        }
+
+        throw err;
+    }
+}
diff --git a/packages/refactor-bot/src/benchmark/loadRefactorResult.ts b/packages/refactor-bot/src/benchmark/loadRefactorResult.ts
new file mode 100644
index 0000000..d23f1f5
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/loadRefactorResult.ts
@@ -0,0 +1,36 @@
+import { readFile } from 'fs/promises';
+import { load } from 'js-yaml';
+import { z } from 'zod';
+
+import { ConfigurationError } from '../errors/configurationError';
+import { line } from '../text/line';
+import { isFileNotFoundError } from '../utils/isFileNotFoundError';
+import { refactorResultSchema } from './refactorResultSchema';
+
+export async function loadRefactorResult(opts: { resultFilePath: string }) {
+    try {
+        return refactorResultSchema.parse(
+            load(await readFile(opts.resultFilePath, 'utf-8'))
+        );
+    } catch (err) {
+        if (isFileNotFoundError(err)) {
+            return undefined;
+        }
+
+        if (err instanceof z.ZodError) {
+            throw new ConfigurationError(
+                line`
+                    Failed to parse result of the refactor at 
+                    "${opts.resultFilePath}" - when the schema changes
+                    re-evaluating the refactor results
+                    will not be possible
+                `,
+                {
+                    cause: err,
+                }
+            );
+        }
+
+        throw err;
+    }
+}
diff --git a/packages/refactor-bot/src/benchmark/refactorResultSchema.ts b/packages/refactor-bot/src/benchmark/refactorResultSchema.ts
new file mode 100644
index 0000000..9178a1d
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/refactorResultSchema.ts
@@ -0,0 +1,36 @@
+import { z } from 'zod';
+
+import { checkoutSandboxResultSchema } from '../refactor/checkoutSandbox';
+import { planFilesResultSchema } from '../refactor/planFiles';
+import {
+    llmUsageEntrySchema,
+    refactorFilesResultSchema,
+} from '../refactor/types';
+
+/**
+ * @note this is a copy of the refactor result schema that is used in the
+ * ../refactor/resultsCollector.ts file - we maintain a second copy here so
+ * that we can introduce breaking changes to the refactor result schema and
+ * "migrate" the old format to the new format using this schema here, if we
+ * have to.
+ */
+export const refactorResultSchema = z
+    .object({
+        objective: z.string(),
+        status: z.enum(['success', 'failure']),
+        error: z.record(z.unknown()).optional(),
+    })
+    .merge(checkoutSandboxResultSchema)
+    .merge(refactorFilesResultSchema)
+    .augment({
+        planFilesResults: z.array(planFilesResultSchema).optional(),
+        usage: z.array(llmUsageEntrySchema),
+        performance: z.object({
+            totalDurationMs: z.number(),
+            durationMsByStep: z.record(
+                z.object({
+                    durationMs: z.number(),
+                })
+            ),
+        }),
+    });
diff --git a/packages/refactor-bot/src/benchmark/reportBenchmarkSummary.ts b/packages/refactor-bot/src/benchmark/reportBenchmarkSummary.ts
new file mode 100644
index 0000000..de9b90c
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/reportBenchmarkSummary.ts
@@ -0,0 +1,68 @@
+import assert from 'assert';
+import { formatWithOptions } from 'util';
+
+import { markdown, printMarkdown } from '../markdown/markdown';
+import { format } from '../text/format';
+import { lowerCamelCaseToKebabCase } from '../utils/lowerCamelCaseToKebabCase';
+import type { summarizeRefactorResult } from './summarizeRefactorResult';
+
+function formatColumn(_: string, column: unknown) {
+    if (typeof column === 'number') {
+        return column.toFixed(2);
+    }
+
+    return formatWithOptions({ colors: true }, '%O', column);
+}
+
+export async function reportBenchmarkSummary(opts: {
+    summaries: Array<{
+        variant: string;
+        summary: Awaited<ReturnType<typeof summarizeRefactorResult>>;
+    }>;
+}) {
+    const keys = opts.summaries.map(({ summary }) => Object.keys(summary))[0];
+    assert(keys !== undefined);
+    assert(keys.length > 0);
+
+    const header = [
+        'variant',
+        ...keys.map((key) => lowerCamelCaseToKebabCase(key)),
+    ].join(' | ');
+    const header2 = ['---', '---', ...keys.map(() => '---')].join(' | ');
+    const table = opts.summaries
+        .map(({ variant, summary }) =>
+            [
+                variant,
+                ...keys.map((key) =>
+                    formatColumn(key, (summary as Record<string, unknown>)[key])
+                ),
+            ].join(' | ')
+        )
+        .join('\n');
+
+    await printMarkdown(
+        format(
+            markdown`
+                # Benchmark results
+
+                %results%
+
+                All columns averaged over \`numberOfRuns\`.
+
+                \`score\` - outcome based metric from 0 to 1 where 1 represents
+                an outcome where every explicit requirement is satisfied and 0
+                represents failed refactor
+
+                \`acceptedRatio\` - ratio of number of files that were accepted
+                as a result of the refactor to the total number of files that
+                were changed
+
+                \`wastedTokensRatio\` - number of tokens that were exchanged
+                with LLM, which produced an outcome that was later discarded
+            `,
+            {
+                results: [header, header2, table].join('\n'),
+            }
+        )
+    );
+}
diff --git a/packages/refactor-bot/src/benchmark/runVariant.ts b/packages/refactor-bot/src/benchmark/runVariant.ts
new file mode 100644
index 0000000..2ac1439
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/runVariant.ts
@@ -0,0 +1,204 @@
+import assert from 'assert';
+import dedent from 'dedent';
+import { mkdir, writeFile } from 'fs/promises';
+import { dump } from 'js-yaml';
+import { join } from 'path';
+import { lastValueFrom, mergeMap, range, toArray } from 'rxjs';
+import { z } from 'zod';
+
+import { makeCachedFunction } from '../cache/makeCachedFunction';
+import { spawnResult } from '../child-process/spawnResult';
+import { findRepositoryRoot } from '../file-system/findRepositoryRoot';
+import { logger } from '../logger/logger';
+import { checkoutSandbox } from '../refactor/checkoutSandbox';
+import { randomText } from '../utils/randomText';
+import {
+    appVariantSchema,
+    evaluationConfigSchema,
+    passthroughRefactorConfigSchema,
+} from './benchmarkConfig';
+
+export const runVariantSchema = z.object({
+    id: z.string(),
+    variant: appVariantSchema,
+    refactorConfig: passthroughRefactorConfigSchema,
+    evaluationConfig: evaluationConfigSchema,
+    numberOfRuns: z.number(),
+    maxConcurrentRefactors: z.number(),
+});
+
+export const runVariant = makeCachedFunction({
+    name: 'run-variant',
+    inputSchema: runVariantSchema,
+    resultSchema: z.object({
+        resultFilePaths: z.array(z.string()),
+    }),
+    transform: async (input, ctx) => {
+        const repoRoot = await findRepositoryRoot();
+
+        const newRuns = input.numberOfRuns - (input.variant.ids?.length ?? 0);
+
+        if (newRuns === 0) {
+            logger.debug('No new runs to start', {
+                numberOfRunsRequired: input.numberOfRuns,
+                numberOfPreEvaluatedRuns: input.variant.ids?.length ?? 0,
+                maxConcurrentRefactors: input.maxConcurrentRefactors,
+            });
+
+            const resultFilePaths = (input.variant.ids ?? []).map(
+                (refactorId) =>
+                    join(
+                        repoRoot,
+                        '.refactor-bot',
+                        'refactors',
+                        input.refactorConfig.name,
+                        'state',
+                        refactorId,
+                        'result.yaml'
+                    )
+            );
+
+            return {
+                resultFilePaths,
+            };
+        }
+
+        const cliSandbox = await checkoutSandbox(
+            {
+                name: ['bench', input.variant.name].join('-'),
+                id: input.id,
+                ref: input.variant.ref,
+                repository: input.variant.repository,
+                allowDirtyWorkingTree: true,
+            },
+            ctx
+        );
+
+        const { objective, ...rest } = input.refactorConfig;
+
+        await mkdir(
+            join(
+                cliSandbox.sandboxDirectoryPath,
+                '.refactor-bot',
+                'refactors',
+                input.refactorConfig.name
+            ),
+            { recursive: true }
+        );
+
+        logger.debug('Created sandbox for the refactor CLI at', {
+            location: cliSandbox.sandboxDirectoryPath,
+        });
+
+        const goalMd = join(
+            cliSandbox.sandboxDirectoryPath,
+            '.refactor-bot',
+            'refactors',
+            input.refactorConfig.name,
+            `goal.md`
+        );
+
+        await writeFile(
+            goalMd,
+            dedent`
+                \`\`\`yaml
+                ${dump(rest)}
+                \`\`\`
+
+                ${objective}
+            `
+        );
+
+        logger.debug('Written refactor config with an objective at', {
+            location: goalMd,
+        });
+
+        logger.debug('Starting refactoring process', {
+            numberOfRuns: input.numberOfRuns,
+            numberOfPreEvaluatedRuns: input.variant.ids?.length ?? 0,
+            maxConcurrentRefactors: input.maxConcurrentRefactors,
+        });
+
+        const results = await lastValueFrom(
+            range(0, newRuns).pipe(
+                mergeMap(async () => {
+                    const refactorId = randomText(8);
+
+                    const result = await spawnResult(
+                        input.variant.command[0],
+                        input.variant.command
+                            .slice(1)
+                            .concat([
+                                '--name',
+                                input.refactorConfig.name,
+                                '--id',
+                                refactorId,
+                            ]),
+                        {
+                            cwd: cliSandbox.sandboxDirectoryPath,
+                            output: [],
+                            exitCodes: 'any',
+                            stdio: 'inherit',
+                            env: {
+                                ...process.env,
+                                LOG_RELATIVE_TO_CWD: process.cwd(),
+                                CACHE_ROOT: repoRoot,
+                            },
+                        }
+                    );
+
+                    logger.debug('Process finished', {
+                        command: input.variant.command.join(' '),
+                        pid: result.pid,
+                        ...(result.error && {
+                            error: result.error,
+                        }),
+                        ...(typeof result.status === 'number' && {
+                            status: result.status,
+                        }),
+                        ...(typeof result.signal === 'string' && {
+                            signal: result.signal,
+                        }),
+                    });
+
+                    assert(!result.signal, 'Failed to run refactor command');
+
+                    return {
+                        resultFilePath: join(
+                            repoRoot,
+                            '.refactor-bot',
+                            'refactors',
+                            input.refactorConfig.name,
+                            'state',
+                            refactorId,
+                            'result.yaml'
+                        ),
+                    };
+                }, input.maxConcurrentRefactors),
+                toArray()
+            )
+        );
+
+        const resultFilePaths = (input.variant.ids ?? [])
+            .map((refactorId) =>
+                join(
+                    repoRoot,
+                    '.refactor-bot',
+                    'refactors',
+                    input.refactorConfig.name,
+                    'state',
+                    refactorId,
+                    'result.yaml'
+                )
+            )
+            .concat(results.map((result) => result.resultFilePath));
+
+        logger.debug('Result file paths below', {
+            resultFilePaths,
+        });
+
+        return {
+            resultFilePaths,
+        };
+    },
+});
diff --git a/packages/refactor-bot/src/benchmark/summarizeCosts.ts b/packages/refactor-bot/src/benchmark/summarizeCosts.ts
new file mode 100644
index 0000000..45a3aee
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/summarizeCosts.ts
@@ -0,0 +1,40 @@
+import type { z } from 'zod';
+
+import { avg } from '../evaluate/utils/avg';
+import { sum } from '../evaluate/utils/sum';
+import { summarizeLlmUsageTokens } from '../refactor/collectLlmUsage';
+import type { refactorResultSchema } from './refactorResultSchema';
+
+export function summarizeCosts(opts: {
+    results: Array<z.output<typeof refactorResultSchema>>;
+}) {
+    const summaries = opts.results.map((result) => {
+        const { totalTokens } = summarizeLlmUsageTokens(result);
+
+        const wastedTokens = sum(
+            result.discarded.map((result) =>
+                sum(result.usage.map((usage) => usage.usage.totalTokens))
+            )
+        );
+
+        return {
+            totalTokens,
+            wastedTokensRatio: wastedTokens / totalTokens,
+        };
+    });
+
+    return {
+        /**
+         * Average number of tokens used in the refactor
+         */
+        totalTokens: avg(summaries.map((summary) => summary.totalTokens)),
+
+        /**
+         * Average ratio of tokens used to produce results which were discarded
+         * during the refactor
+         */
+        wastedTokensRatio: avg(
+            summaries.map((summary) => summary.wastedTokensRatio)
+        ),
+    };
+}
diff --git a/packages/refactor-bot/src/benchmark/summarizeOutcomeScore.ts b/packages/refactor-bot/src/benchmark/summarizeOutcomeScore.ts
new file mode 100644
index 0000000..700afb2
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/summarizeOutcomeScore.ts
@@ -0,0 +1,56 @@
+import type { z } from 'zod';
+
+import { avg } from '../evaluate/utils/avg';
+import { summarizeRefactorFilesResult } from '../refactor/types';
+import type { refactorResultSchema } from './refactorResultSchema';
+
+export function summarizeOutcomeScore(opts: {
+    results: Array<z.output<typeof refactorResultSchema>>;
+}) {
+    const summaries = opts.results.map((result) => {
+        const { accepted, discarded } = summarizeRefactorFilesResult(result);
+
+        const acceptedFileNames = Object.keys(accepted.resultsByFilePaths);
+        const discardedFileNames = Object.keys(discarded.resultsByFilePaths);
+
+        if (acceptedFileNames.length === 0 && discardedFileNames.length === 0) {
+            return {
+                score: 0,
+                acceptedRatio: 0,
+            };
+        }
+
+        const score = avg([
+            ...Object.values(accepted.resultsByFilePaths).map((results) =>
+                avg(results.map((result) => result.evaluation?.score ?? 0))
+            ),
+            ...Object.values(discarded.resultsByFilePaths).map((results) =>
+                avg(results.map((result) => result.evaluation?.score ?? 0))
+            ),
+        ]);
+
+        return {
+            score,
+            acceptedRatio:
+                acceptedFileNames.length /
+                (acceptedFileNames.length + discardedFileNames.length),
+        };
+    });
+
+    return {
+        /**
+         * Average evaluation score for all refactors
+         *
+         * This score reflects whether the refactoring objective is fully
+         * complete or not. Value of 1 means that all requirements were met,
+         * value of 0 means that none of the requirements were met.
+         */
+        score: avg(summaries.map((summary) => summary.score)),
+
+        /**
+         * Average ratio of accepted file changes to the total number of
+         * files we attempted to refactor
+         */
+        acceptedRatio: avg(summaries.map((summary) => summary.acceptedRatio)),
+    };
+}
diff --git a/packages/refactor-bot/src/benchmark/summarizePerformance.ts b/packages/refactor-bot/src/benchmark/summarizePerformance.ts
new file mode 100644
index 0000000..e9960ca
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/summarizePerformance.ts
@@ -0,0 +1,21 @@
+import type { z } from 'zod';
+
+import { avg } from '../evaluate/utils/avg';
+import type { refactorResultSchema } from './refactorResultSchema';
+
+export function summarizePerformance(opts: {
+    results: Array<z.output<typeof refactorResultSchema>>;
+}) {
+    const summaries = opts.results.map((result) => {
+        return {
+            totalDurationMs: result.performance.totalDurationMs,
+        };
+    });
+
+    return {
+        /**
+         * Average duration of the refactor
+         */
+        durationMs: avg(summaries.map((summary) => summary.totalDurationMs)),
+    };
+}
diff --git a/packages/refactor-bot/src/benchmark/summarizeRefactorResult.ts b/packages/refactor-bot/src/benchmark/summarizeRefactorResult.ts
new file mode 100644
index 0000000..52becb6
--- /dev/null
+++ b/packages/refactor-bot/src/benchmark/summarizeRefactorResult.ts
@@ -0,0 +1,49 @@
+import { ConfigurationError } from '../errors/configurationError';
+import { line } from '../text/line';
+import { loadRefactorResult } from './loadRefactorResult';
+import { summarizeCosts } from './summarizeCosts';
+import { summarizeOutcomeScore } from './summarizeOutcomeScore';
+import { summarizePerformance } from './summarizePerformance';
+
+export async function summarizeRefactorResult(opts: {
+    resultFilePaths: string[];
+}) {
+    const results = await Promise.all(
+        opts.resultFilePaths.map((resultFilePath) =>
+            loadRefactorResult({
+                resultFilePath,
+            })
+        )
+    );
+
+    const loadedResults = results.flatMap((result) => (result ? [result] : []));
+
+    if (loadedResults.length === 0) {
+        throw new ConfigurationError(
+            line`
+                No parsable refactor results found, this could happen when
+                refactoring has failed for all attempts of the given variant -
+                double check that the configuration is correct
+            `
+        );
+    }
+
+    const outcomeScore = summarizeOutcomeScore({
+        results: loadedResults,
+    });
+
+    const costs = summarizeCosts({
+        results: loadedResults,
+    });
+
+    const performance = summarizePerformance({
+        results: loadedResults,
+    });
+
+    return {
+        numberOfRuns: loadedResults.length,
+        ...outcomeScore,
+        ...costs,
+        ...performance,
+    };
+}
diff --git a/packages/refactor-bot/src/bootstrap.ts b/packages/refactor-bot/src/bootstrap.ts
index 36a889e..1556cf6 100644
--- a/packages/refactor-bot/src/bootstrap.ts
+++ b/packages/refactor-bot/src/bootstrap.ts
@@ -1,8 +1,16 @@
 export async function bootstrap(load: () => Promise<unknown>) {
+    if (process.argv.includes('--inspect')) {
+        await import('inspector').then((inspector) => {
+            console.log('Attach to PID', process.pid);
+            inspector.open(undefined, undefined, true);
+            // eslint-disable-next-line no-debugger
+            debugger;
+        });
+    }
     await import('dotenv').then((dotenv) =>
         dotenv.config({
             override: true,
         })
     );
-    await load();
+    return await load();
 }
diff --git a/packages/refactor-bot/src/cache/cache.ts b/packages/refactor-bot/src/cache/cache.ts
index e6d8596..e98b5ef 100644
--- a/packages/refactor-bot/src/cache/cache.ts
+++ b/packages/refactor-bot/src/cache/cache.ts
@@ -56,7 +56,19 @@ export async function lookupEventsInCache<
 
     if (
         opts.state.enableCacheFor &&
-        !opts.state.enableCacheFor.includes(opts.name)
+        !opts.state.enableCacheFor.includes(opts.name) &&
+        !opts.state.enableCacheFor.includes(basename(opts.key))
+    ) {
+        return {
+            foundLocation: undefined,
+            foundEvents: undefined,
+        };
+    }
+
+    if (
+        opts.state.disableCacheFor &&
+        (opts.state.disableCacheFor.includes(opts.name) ||
+            opts.state.disableCacheFor.includes(basename(opts.key)))
     ) {
         return {
             foundLocation: undefined,
@@ -73,14 +85,21 @@ export async function lookupEventsInCache<
     }
 
     try {
-        const cwd = opts.location;
+        const cwd = opts.state.internal_unboundCacheLookup
+            ? opts.state.location ?? opts.location
+            : opts.location;
         /**
          * @note to simplify mocking there is only fg dependency, but
          * what we really needed here is fs.stats, but let's use the fg
          * so we don't have to mock fs.stats and sync it with the fg.
          */
         const entries = await opts.state.deps.fg(
-            [`${basename(opts.key)}.yaml`],
+            opts.state.internal_unboundCacheLookup
+                ? [
+                      `${basename(opts.key)}.yaml`,
+                      `**/${basename(opts.key)}.yaml`,
+                  ]
+                : [`${basename(opts.key)}.yaml`],
             {
                 cwd,
                 ignore: [],
@@ -156,6 +175,9 @@ export async function saveEventsToCache<
 }
 
 export async function cleanCache(
+    opts: {
+        cleanRoot?: boolean;
+    },
     ctx: { location?: string },
     deps = defaultDeps
 ) {
@@ -183,7 +205,9 @@ export async function cleanCache(
         return;
     }
 
-    logger.trace('Cleaning', relative(process.cwd(), location));
+    logger.trace('Cleaning', {
+        location,
+    });
 
     /**
      * Filter out any directory or file that appears in the log
@@ -218,17 +242,17 @@ export async function cleanCache(
         ),
     ];
 
-    const filesAndDirs = await fg([`*.yaml`, `*`, ...include], {
-        cwd: location,
-        ignore,
-        onlyFiles: false,
-    });
+    const filesAndDirs = await fg(
+        [...(opts.cleanRoot ? [`*.yaml`, `*`] : []), ...include],
+        {
+            cwd: location,
+            ignore,
+            onlyFiles: false,
+        }
+    );
 
     for (const entry of filesAndDirs) {
-        logger.trace(
-            `Deleting`,
-            relative(process.cwd(), join(location, entry))
-        );
+        logger.trace(`Deleting`, join(location, entry));
 
         if (entry.endsWith('.yaml')) {
             await unlink(join(location, entry));
diff --git a/packages/refactor-bot/src/cache/log.ts b/packages/refactor-bot/src/cache/log.ts
index 16a57cd..5ea8e0e 100644
--- a/packages/refactor-bot/src/cache/log.ts
+++ b/packages/refactor-bot/src/cache/log.ts
@@ -1,5 +1,3 @@
-import { relative } from 'path';
-
 import { CycleDetectedError } from '../errors/cycleDetectedError';
 import { type CacheState, getPipelineState } from './state';
 import type { CacheStateRef } from './types';
@@ -38,9 +36,8 @@ export function addToExecutionLog(opts: { state: CacheState; key: string }) {
 export function logExecutionLog(ctx: CacheStateRef) {
     const state = getPipelineState(ctx);
     if (state) {
-        state.deps.logger.debug(
-            `Execution log:`,
-            state.log.map((entry) => relative(process.cwd(), entry))
-        );
+        state.deps.logger.debug(`Execution log:`, {
+            executionLog: state.log.map((entry) => entry),
+        });
     }
 }
diff --git a/packages/refactor-bot/src/cache/makeCachedFunction.ts b/packages/refactor-bot/src/cache/makeCachedFunction.ts
index f1d052c..756849f 100644
--- a/packages/refactor-bot/src/cache/makeCachedFunction.ts
+++ b/packages/refactor-bot/src/cache/makeCachedFunction.ts
@@ -51,6 +51,8 @@ export function makeCachedFunction<
             },
             ctx: CacheStateRef & {
                 dispatch: (typeof defaultDeps)['dispatch'];
+                actions: (typeof defaultDeps)['actions'];
+                skipSavingToCache: (err: unknown) => void;
             }
         ) => Promise<TypeOf<OutputSchema>>;
     },
diff --git a/packages/refactor-bot/src/cache/makeCachedObservable.test.ts b/packages/refactor-bot/src/cache/makeCachedObservable.test.ts
index b361048..713def9 100644
--- a/packages/refactor-bot/src/cache/makeCachedObservable.test.ts
+++ b/packages/refactor-bot/src/cache/makeCachedObservable.test.ts
@@ -102,7 +102,7 @@ const setup = () => {
             return makeCachedObservableToTest(...params);
         }) satisfies typeof makeCachedObservableToTest,
         cleanCache: ((...params) => {
-            params[1] = deps;
+            params[2] = deps;
             return cleanCacheToTest(...params);
         }) satisfies typeof cleanCache,
     };
@@ -922,7 +922,7 @@ it('should delete old cached files when clean is called', async () => {
         'add-660e.yaml': expect.arrayContaining([addResult({ value: 1 })]),
     });
 
-    await cleanCache(ctx);
+    await cleanCache({ cleanRoot: true }, ctx);
 
     expect(Object.fromEntries(files.entries())).toEqual({
         'add-10bb.yaml': expect.arrayContaining([addResult({ value: 3 })]),
@@ -955,7 +955,7 @@ it('should clean only on executed levels', async () => {
         'multiply-26e7'
     );
 
-    await cleanCache(ctx, deps);
+    await cleanCache({ cleanRoot: true }, ctx, deps);
 
     /**
      * @note note that files with xxxx hash deleted but not files with xxyy hash
@@ -974,7 +974,7 @@ it('should clean only on executed levels', async () => {
         'sub-pipe-51ea/multiply-b956'
     );
 
-    await cleanCache(ctx, deps);
+    await cleanCache({ cleanRoot: true }, ctx, deps);
 
     /**
      * @note note that files with xxyy hash deleted now
diff --git a/packages/refactor-bot/src/cache/makeCachedObservable.ts b/packages/refactor-bot/src/cache/makeCachedObservable.ts
index 9d57d5f..28c3f92 100644
--- a/packages/refactor-bot/src/cache/makeCachedObservable.ts
+++ b/packages/refactor-bot/src/cache/makeCachedObservable.ts
@@ -93,6 +93,8 @@ export function makeCachedObservable<
             },
             ctx: CacheStateRef & {
                 dispatch: (action: AnyAction) => void;
+                actions: (typeof defaultDeps)['actions'];
+                skipSavingToCache: (err: unknown) => void;
             }
         ) => Observable<TypeOf<EventSchema> | AnyAction>;
     },
@@ -235,7 +237,7 @@ export function makeCachedObservable<
                     `,
                     {
                         ...(location && {
-                            location: relative(process.cwd(), location),
+                            location,
                         }),
                     }
                 );
@@ -252,12 +254,21 @@ export function makeCachedObservable<
                     key,
                 });
 
+                let savingToCacheWasSkippedByTheFunctionCall = false;
+
+                const skipSavingToCache = () => {
+                    deps.logger.trace(`Saving events has been skipped`, {
+                        location: location || '.',
+                    });
+                    savingToCacheWasSkippedByTheFunctionCall = true;
+                };
+
                 const resultingEvents = concat(
                     of(
                         executionStarted({
                             name,
                             key,
-                            input,
+                            input: validatedInput,
                         })
                     ),
                     opts.factory(validatedInput, {
@@ -266,6 +277,8 @@ export function makeCachedObservable<
                             location: join(location, basename(key)),
                         }),
                         dispatch: recordAndDispatch,
+                        actions: deps.actions,
+                        skipSavingToCache,
                     }) as Observable<AnyAction>,
                     of(
                         executionSuccess({
@@ -291,6 +304,10 @@ export function makeCachedObservable<
                 );
 
                 const saveResultingEvents = defer(async () => {
+                    if (savingToCacheWasSkippedByTheFunctionCall) {
+                        return;
+                    }
+
                     await saveEventsToCache({
                         events,
                         eventsSchema,
diff --git a/packages/refactor-bot/src/cache/persistence.ts b/packages/refactor-bot/src/cache/persistence.ts
index 52c5d2e..084a7b8 100644
--- a/packages/refactor-bot/src/cache/persistence.ts
+++ b/packages/refactor-bot/src/cache/persistence.ts
@@ -1,4 +1,4 @@
-import { dirname, relative } from 'path';
+import { dirname } from 'path';
 import type { z, ZodType } from 'zod';
 import { type TypeOf } from 'zod';
 
@@ -54,9 +54,9 @@ export const saveEvents = async <
 ) => {
     const { logger } = deps;
 
-    logger.trace(
-        `Saving events to "${relative(process.cwd(), opts.location)}"`
-    );
+    logger.trace(`Saving events`, {
+        location: opts.location,
+    });
 
     await save(
         {
@@ -79,9 +79,9 @@ export const loadEvents = async <
     },
     deps = defaultDeps
 ) => {
-    deps.logger.trace(
-        `Loading events from "${relative(process.cwd(), opts.location)}"`
-    );
+    deps.logger.trace(`Loading events`, {
+        location: opts.location,
+    });
 
     const events = await load(
         {
diff --git a/packages/refactor-bot/src/cache/state.ts b/packages/refactor-bot/src/cache/state.ts
index c7ba8ed..107a258 100644
--- a/packages/refactor-bot/src/cache/state.ts
+++ b/packages/refactor-bot/src/cache/state.ts
@@ -1,5 +1,4 @@
 import { AsyncLocalStorage } from 'async_hooks';
-import { relative } from 'path';
 
 import type { AnyAction } from '../event-bus';
 import { line } from '../text/line';
@@ -40,12 +39,27 @@ export type CacheState = {
     readonly saveToCache: boolean;
 
     /**
-     * Whether to load the cache for the given function or not, this allows us
-     * to disable the cache for certain functions only for debugging and testing
-     * purposes.
+     * Enable cache for specific steps - you can specify the name of the step
+     * or a name followed by a hash of the cache entry.
      */
     readonly enableCacheFor?: string[];
 
+    /**
+     * Disable cache for specific steps - you can specify the name of the step
+     * or a name followed by a hash of the cache entry.
+     */
+    readonly disableCacheFor?: string[];
+
+    /**
+     * For debugging purposes, if set to `true` the pipeline will ignore the
+     * hierarchy of the pipeline and will attempt to look for any cached
+     * entries in the cache store, regardless of the parent function.
+     *
+     * This is useful when we want to refactor function names or introduce new
+     * hierarchy, but also do not want to loose the cache state.
+     */
+    readonly internal_unboundCacheLookup?: boolean;
+
     /**
      * If set to `true` the pipeline will stop executing and throw
      * an error.
@@ -65,6 +79,7 @@ export const initializeCacheState = (
         saveToCache?: boolean;
         saveInput?: boolean;
         enableCacheFor?: string[];
+        disableCacheFor?: string[];
     },
     depsRaw: Partial<typeof defaultDeps> = defaultDeps
 ) => {
@@ -83,7 +98,7 @@ export const initializeCacheState = (
     let state = withSymbol?.[pipelineState];
 
     if (!state) {
-        const location = relative(process.cwd(), ctx?.location || '.');
+        const location = ctx?.location || '.';
 
         if (location) {
             logger.debug('Initializing new pipeline state', {
@@ -104,6 +119,8 @@ export const initializeCacheState = (
             log: [],
             saveToCache: ctx?.saveToCache ?? true,
             enableCacheFor: ctx?.enableCacheFor,
+            disableCacheFor: ctx?.disableCacheFor,
+            internal_unboundCacheLookup: false,
             deps,
         };
     }
@@ -142,6 +159,7 @@ export function createCachedPipeline<Input, Output>(opts: {
      * Location of the cache
      */
     location: string;
+
     /**
      * Whether to save the results of the pipeline to the cache or not, this
      * allows disabling modifications to the cache to ensure currently cached
@@ -151,27 +169,48 @@ export function createCachedPipeline<Input, Output>(opts: {
      * get the hit.
      */
     saveToCache: boolean;
+
     /**
-     * Whether to load the cache for the given function or not, this allows us
-     * to disable the cache for certain functions only for debugging and testing
-     * purposes.
+     * Enable cache for specific steps - you can specify the name of the step
+     * or a name followed by a hash of the cache entry.
      */
     enableCacheFor?: string[];
 
-    pipeline: (input: Input) => Promise<Output>;
+    /**
+     * Disable cache for specific steps - you can specify the name of the step
+     * or a name followed by a hash of the cache entry.
+     */
+    disableCacheFor?: string[];
+
+    /**
+     * Whether to clean the cache at the end of the execution or not, this is
+     * disabled by default
+     */
+    cleanCache?: boolean;
+
+    /**
+     * Whether to clean the root directory as well, or only the subdirectories
+     * that appeared in the log
+     */
+    cleanRoot?: boolean;
+
+    pipeline: (input: Input, ctx: CacheStateRef) => Promise<Output>;
 }) {
     const ctx = {
         location: opts.location,
         saveToCache: opts.saveToCache,
         enableCacheFor: opts.enableCacheFor,
+        disableCacheFor: opts.disableCacheFor,
     };
 
     initializeCacheState(ctx);
 
-    const executePipeline = async (input: Input) => {
+    const executePipeline = async (input: Input, ctx: CacheStateRef) => {
         try {
-            const result = await opts.pipeline(input);
-            await cleanCache(ctx);
+            const result = await opts.pipeline(input, ctx);
+            if (opts.cleanCache) {
+                await cleanCache({ cleanRoot: opts.cleanRoot ?? false }, ctx);
+            }
             return result;
         } finally {
             logExecutionLog(ctx);
@@ -180,7 +219,7 @@ export function createCachedPipeline<Input, Output>(opts: {
 
     const execute = async (input: Input) => {
         return asyncLocalStorage.run(ctx, () => {
-            return executePipeline(input);
+            return executePipeline(input, ctx);
         });
     };
 
diff --git a/packages/refactor-bot/src/chat-gpt/api.ts b/packages/refactor-bot/src/chat-gpt/api.ts
index d96701d..016b9a8 100644
--- a/packages/refactor-bot/src/chat-gpt/api.ts
+++ b/packages/refactor-bot/src/chat-gpt/api.ts
@@ -251,8 +251,8 @@ export function estimatePrice(
 }
 
 export function calculatePrice(
-    opts: Response & {
-        model: Models;
+    opts: Pick<Response, 'usage'> & {
+        model: string;
     }
 ) {
     const model = opts.model;
@@ -312,13 +312,13 @@ export async function chatCompletions(opts: Opts): Promise<Response> {
             ...(opts.functionCall && {
                 function_call: opts.functionCall,
             }),
-            ...(opts.maxTokens && {
+            ...(typeof opts.maxTokens === 'number' && {
                 max_tokens: opts.maxTokens,
             }),
-            ...(opts.temperature && {
+            ...(typeof opts.temperature === 'number' && {
                 temperature: opts.temperature,
             }),
-            ...(opts.choices && {
+            ...(typeof opts.choices === 'number' && {
                 n: opts.choices,
             }),
         } satisfies BodyShape),
diff --git a/packages/refactor-bot/src/child-process/spawnResult.ts b/packages/refactor-bot/src/child-process/spawnResult.ts
index 7d4f5f7..196e137 100644
--- a/packages/refactor-bot/src/child-process/spawnResult.ts
+++ b/packages/refactor-bot/src/child-process/spawnResult.ts
@@ -1,6 +1,7 @@
 import assert from 'assert';
 
 import { logger } from '../logger/logger';
+import { ensureHasOneElement } from '../utils/hasOne';
 import { UnreachableError } from '../utils/UnreachableError';
 import type { SpawnParameterMix, SpawnToPromiseOpts } from './spawnToPromise';
 import { spawnToPromise, spawnWithSpawnParameters } from './spawnToPromise';
@@ -25,6 +26,7 @@ export type SpawnResultReturn = {
     status: number | null;
     signal: NodeJS.Signals | null;
     error?: Error | undefined;
+    args: [string, ...string[]];
 };
 
 export async function spawnResult(
@@ -85,6 +87,9 @@ export async function spawnResult(
         pid: child.pid,
         signal: child.signalCode,
         status: child.exitCode,
+        get args(): [string, ...string[]] {
+            return ensureHasOneElement(child.spawnargs);
+        },
         get output() {
             return combinedData;
         },
diff --git a/packages/refactor-bot/src/evaluate/evaluateFileChanges.ts b/packages/refactor-bot/src/evaluate/evaluateFileChanges.ts
new file mode 100644
index 0000000..cc96ce6
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/evaluateFileChanges.ts
@@ -0,0 +1,215 @@
+import { z } from 'zod';
+
+import { makeCachedFunction } from '../cache/makeCachedFunction';
+import type { CacheStateRef } from '../cache/types';
+import type { RegularAssistantMessage } from '../chat-gpt/api';
+import { markdown } from '../markdown/markdown';
+import { formatBulletList } from '../prompt-formatters/formatBulletList';
+import { formatFileContents } from '../prompt-formatters/formatFileContents';
+import { formatFileDiff } from '../prompt-formatters/formatFileDiff';
+import { formatOptional } from '../prompt-formatters/formatOptional';
+import { formatZodError } from '../prompt-formatters/formatZodError';
+import {
+    prompt,
+    promptParametersFrom,
+    refactorConfigPromptOptsSchema,
+} from '../refactor/prompt';
+import { parseJsonResponse } from '../response-parsers/parseJsonResponse';
+import { format } from '../text/format';
+import { line } from '../text/line';
+import { ensureHasOneElement } from '../utils/hasOne';
+
+export const evaluateFileChangesInput = refactorConfigPromptOptsSchema.augment({
+    requirements: z.array(z.string()).nonempty(),
+    filePath: z.string(),
+    fileContentsBefore: z.string().optional(),
+    fileContentsAfter: z.string().optional(),
+    fileDiff: z.string().optional(),
+    issues: z.array(z.string()).optional(),
+    index: z.number().optional(),
+    choices: z.number().optional(),
+    temperature: z.number().optional(),
+});
+
+const promptResponseSchema = z.object({
+    summary: z.string(),
+    requirements: z.array(
+        z.object({
+            description: z.string(),
+            satisfied: z.boolean(),
+        })
+    ),
+});
+
+export const evaluateFileResultSchema = z.object({
+    key: z.string().optional(),
+    choices: z.array(promptResponseSchema).nonempty(),
+});
+
+const systemPromptText = markdown`
+    Think step by step. Be concise and to the point. Do not make assumptions and
+    follow instructions exactly.
+`;
+
+const promptText = (opts: {
+    requirements: string[];
+    filePath: string;
+    fileContentsBefore: string;
+    fileContentsAfter: string;
+    fileDiff: string;
+    issues: string[];
+}) =>
+    format(
+        markdown`
+            <requirements>
+            %requirements%
+            </requirements>
+
+            Given above requirements, an algorithm has made modifications to the
+            file at \`%filePath%\`. We now want to assess changes in that file.
+            Strictly focus only on the file mentioned.
+
+            Below information is available to you and applies only to the file
+            \`%filePath%\`:
+
+            %fileDiff%
+
+            %fileContentsAfter%
+
+            %issues%
+
+            Assess the changes made by the algorithm and determine whether the
+            requirement was satisfied or not. Do not add new requirements. Do
+            not rephrase requirements. Do not change order of the requirements.
+            Do not assess the quality of the implementation in relation to the
+            requirement, but focus on the outcome. The number of requirements
+            returned should be the same as the number of requirements given.
+
+            Return the following JSON result when you are done:
+
+            ~~~json
+            {
+                "summary": "<summary of the assessment>",
+                "requirements": [
+                    {
+                        "description": "<requirement>",
+                        "satisfied": <true/false - whether requirement is satisfied or not>
+                    }
+                ]
+            }
+            ~~~
+
+            In your final response, only include the JSON in the above format
+            wrapped with the code block. Do not include any other text or
+            output.
+        `,
+        {
+            requirements: formatBulletList({
+                items: ensureHasOneElement(opts.requirements),
+                heading: `List of requirements:`,
+            }),
+            filePath: opts.filePath,
+            fileDiff: formatOptional({
+                heading: 'Following changes were made to the file:',
+                text: formatFileDiff({
+                    fileDiff: opts.fileDiff,
+                }),
+            }),
+            fileContentsAfter: formatOptional({
+                heading: 'File contents after the changes:',
+                text: formatFileContents({
+                    fileContents: opts.fileContentsAfter,
+                    filePath: opts.filePath,
+                }),
+            }),
+            issues: formatBulletList({
+                heading: 'Following issues were found after the changes:',
+                empty: line`
+                    We've checked the code for issues - no issues were found.
+                `,
+                items: opts.issues,
+            }),
+        }
+    );
+
+export const evaluateFileChanges = makeCachedFunction({
+    name: 'eval-file',
+    inputSchema: evaluateFileChangesInput,
+    resultSchema: evaluateFileResultSchema,
+    transform: async (
+        input: z.input<typeof evaluateFileChangesInput>,
+        ctx?: CacheStateRef
+    ) => {
+        const {
+            filePath,
+            requirements,
+            fileDiff,
+            fileContentsBefore,
+            fileContentsAfter,
+            issues = [],
+        } = input;
+
+        const validateResponse = (message: RegularAssistantMessage) =>
+            parseJsonResponse(
+                message.content,
+                z.object({
+                    summary: z.string(),
+                    requirements: z.array(
+                        z.object({
+                            description: z.string(),
+                            satisfied: z.boolean(),
+                        })
+                    ),
+                })
+            );
+
+        const promptParams = promptParametersFrom(
+            {
+                ...input,
+                allowedFunctions: [],
+            },
+            ctx
+        );
+
+        const result = await prompt(
+            {
+                preface: systemPromptText,
+                prompt: promptText({
+                    filePath,
+                    requirements,
+                    fileDiff: fileDiff || '',
+                    fileContentsBefore: fileContentsBefore || '',
+                    fileContentsAfter: fileContentsAfter || '',
+                    issues,
+                }),
+                temperature: input.temperature ?? 0.2,
+                choices: input.choices,
+                shouldStop: (message) => {
+                    try {
+                        validateResponse(message);
+                        return true;
+                    } catch (err) {
+                        if (err instanceof z.ZodError) {
+                            return formatZodError({
+                                error: err,
+                            });
+                        }
+                        return String(err);
+                    }
+                },
+                ...promptParams,
+                seed: input.index?.toString(),
+            },
+            ctx
+        );
+
+        return {
+            key: result.key,
+            choices: ensureHasOneElement(
+                result.choices.map((choice) =>
+                    validateResponse(choice.resultingMessage)
+                )
+            ),
+        };
+    },
+});
diff --git a/packages/refactor-bot/src/evaluate/evaluateFileScore.ts b/packages/refactor-bot/src/evaluate/evaluateFileScore.ts
new file mode 100644
index 0000000..3ba8e4b
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/evaluateFileScore.ts
@@ -0,0 +1,85 @@
+import { z } from 'zod';
+
+import type { CacheStateRef } from '../cache/types';
+import { gitShowFile } from '../git/gitShowFile';
+import { gitShowFileCommitRangeSummary } from '../git/gitShowFileCommitSummary';
+import {
+    evaluateFileChanges,
+    evaluateFileChangesInput,
+} from './evaluateFileChanges';
+import { evaluateUnchangedFile } from './evaluateUnchangedFile';
+import { avg } from './utils/avg';
+
+export const evaluateFileScoreSchema = evaluateFileChangesInput
+    .omit({
+        fileContentsAfter: true,
+        fileContentsBefore: true,
+        fileDiff: true,
+    })
+    .augment({
+        commit: z.string().optional(),
+        commitBeforeChanges: z.string(),
+    });
+
+export const evaluateFileScoreResultSchema = z.object({
+    key: z.string().optional(),
+    score: z.number(),
+});
+
+/**
+ * Determines if the file has changed since the last commit and evaluates the
+ * changes if so, then aggregates the results into a single score.
+ */
+export const evaluateFileScore = async (
+    input: z.input<typeof evaluateFileScoreSchema>,
+    ctx?: CacheStateRef
+) => {
+    const { sandboxDirectoryPath, commit, commitBeforeChanges, filePath } =
+        input;
+
+    const changeInfo = commit
+        ? await gitShowFileCommitRangeSummary({
+              location: sandboxDirectoryPath,
+              filePath: filePath,
+              from: commitBeforeChanges,
+              to: commit,
+          })
+        : undefined;
+
+    const evaluationResults =
+        changeInfo && changeInfo.fileDiff
+            ? await evaluateFileChanges(
+                  {
+                      ...input,
+                      ...changeInfo,
+                  },
+                  ctx
+              )
+            : await evaluateUnchangedFile(
+                  {
+                      ...input,
+                      fileContents: await gitShowFile({
+                          location: sandboxDirectoryPath,
+                          filePath: input.filePath,
+                          ref: input.commitBeforeChanges,
+                      }),
+                  },
+                  ctx
+              );
+
+    const scoreChoices = evaluationResults.choices.map((choice) => {
+        const hit = choice.requirements.filter((r) => r.satisfied).length;
+
+        const score = hit / choice.requirements.length;
+
+        return {
+            score,
+            summary: choice.summary,
+        };
+    });
+
+    return {
+        key: evaluationResults.key,
+        score: avg(scoreChoices.map((choice) => choice.score)),
+    };
+};
diff --git a/packages/refactor-bot/src/evaluate/evaluateRefactorResult.ts b/packages/refactor-bot/src/evaluate/evaluateRefactorResult.ts
new file mode 100644
index 0000000..21ee9e5
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/evaluateRefactorResult.ts
@@ -0,0 +1,267 @@
+import assert from 'assert';
+import type { ObservedValueOf } from 'rxjs';
+import {
+    concatMap,
+    from,
+    lastValueFrom,
+    map,
+    mergeMap,
+    range,
+    scan,
+    startWith,
+    toArray,
+    zip,
+} from 'rxjs';
+import { z } from 'zod';
+
+import { refactorResultSchema } from '../benchmark/refactorResultSchema';
+import { makeCachedFunction } from '../cache/makeCachedFunction';
+import type { CacheStateRef } from '../cache/types';
+import { modelsSchema } from '../chat-gpt/api';
+import { summarizeRefactorFilesResult } from '../refactor/types';
+import { evaluateFileScore } from './evaluateFileScore';
+import { extractRequirements } from './extractRequirements';
+import { avg } from './utils/avg';
+
+const evaluateRefactorResultOptsSchema = z.object({
+    /**
+     * Model to use for evaluation
+     */
+    model: modelsSchema.optional(),
+
+    /**
+     * Temperature to use with the LLM for evaluation
+     */
+    temperature: z.number().optional(),
+
+    /**
+     * Number of choices to request from the model
+     */
+    choices: z.number(),
+
+    /**
+     * Results of the refactor we are evaluating
+     */
+    result: refactorResultSchema,
+
+    index: z.number().optional(),
+});
+
+const evaluateRefactorResultOnceSchema =
+    evaluateRefactorResultOptsSchema.augment({
+        /**
+         * Objective split into requirements, we do the splitting
+         * beforehand to avoid LLM trying to do multiple things at once and/or
+         * producing extra unnecessary variance
+         */
+        requirements: z.array(z.string()).nonempty(),
+    });
+
+const evaluateRefactorResultOnce = makeCachedFunction({
+    inputSchema: evaluateRefactorResultOnceSchema,
+    resultSchema: z.object({
+        key: z.string().optional(),
+        score: z.number(),
+        scorePerFile: z.record(z.number()),
+    }),
+    name: 'eval-refactor',
+    transform: async (
+        opts: z.output<typeof evaluateRefactorResultOnceSchema>,
+        ctx?: CacheStateRef
+    ) => {
+        const {
+            requirements,
+            result: { sandboxDirectoryPath, startCommit },
+            model,
+        } = opts;
+
+        const { accepted, discarded } = summarizeRefactorFilesResult({
+            accepted: opts.result.accepted,
+            discarded: opts.result.discarded,
+        });
+
+        const stepsPerFile = from([
+            ...Object.entries({
+                ...discarded.resultsByFilePaths,
+                ...accepted.resultsByFilePaths,
+            }),
+        ]).pipe(
+            mergeMap(([filePath, results]) =>
+                from(results).pipe(
+                    mergeMap(({ file: result }) =>
+                        result.steps.map((step) => ({
+                            filePath,
+                            task: step.task,
+                            timestamp: step.timestamp,
+                            commit: step.commit,
+                            checkSummary: step.checkSummary,
+                        }))
+                    ),
+                    (stream) =>
+                        zip(
+                            stream,
+                            stream.pipe(
+                                scan(
+                                    (commitBeforeChanges, next) =>
+                                        /**
+                                         * When an attempt to refactor a file
+                                         * resulted in no commits, take previous
+                                         * commit as a commitBeforeChanges.
+                                         */ next.commit
+                                            ? next.commit
+                                            : commitBeforeChanges,
+                                    startCommit
+                                ),
+                                // make this one lag one step behind
+                                startWith(startCommit)
+                            )
+                        ),
+                    map(([step, commitBeforeChanges]) => ({
+                        ...step,
+                        commitBeforeChanges,
+                    })),
+                    toArray(),
+                    map((steps) => ({
+                        filePath,
+                        steps,
+                    }))
+                )
+            ),
+            map(({ filePath, steps }) => {
+                const first = steps[0];
+                const last = steps[steps.length - 1];
+                if (first === last) {
+                    return {
+                        filePath,
+                        steps,
+                    };
+                } else {
+                    return {
+                        filePath,
+                        steps: [
+                            {
+                                ...last,
+                                commitBeforeChanges:
+                                    first?.commitBeforeChanges || startCommit,
+                                filePath,
+                            },
+                        ],
+                    };
+                }
+            })
+        );
+
+        const evaluateStep = async (
+            step: ObservedValueOf<typeof stepsPerFile>['steps'][number]
+        ) => {
+            const issues = [
+                ...(step.checkSummary?.newIssues || []),
+                ...(step.checkSummary?.remainingIssues || []),
+            ].map((issue) => issue.issue);
+
+            return {
+                ...(await evaluateFileScore(
+                    {
+                        sandboxDirectoryPath,
+                        index: opts.index,
+                        requirements,
+                        filePath: step.filePath,
+                        commitBeforeChanges: step.commitBeforeChanges,
+                        commit: step.commit,
+                        issues,
+                        model,
+                    },
+                    ctx
+                )),
+                filePath: step.filePath,
+            };
+        };
+
+        const scorePerFile = await lastValueFrom(
+            stepsPerFile.pipe(
+                mergeMap(({ steps, filePath }) => {
+                    return from(steps).pipe(
+                        //
+                        concatMap(evaluateStep),
+                        toArray(),
+                        map((steps) => {
+                            assert(
+                                steps.every(
+                                    (step) => step.filePath === filePath
+                                )
+                            );
+                            return {
+                                filePath,
+                                score: avg(steps.map((r) => r.score)),
+                                steps,
+                            };
+                        })
+                    );
+                }, 2),
+                toArray()
+            )
+        );
+
+        const score = avg(scorePerFile.map((r) => r.score));
+
+        return {
+            key: ctx?.location,
+            score,
+            scorePerFile: Object.fromEntries(
+                scorePerFile.map((r) => [r.filePath, r.score])
+            ),
+        };
+    },
+});
+
+export async function evaluateRefactorResult(
+    opts: z.output<typeof evaluateRefactorResultOptsSchema> & {
+        times: number;
+    },
+    ctx?: CacheStateRef
+) {
+    const extractRequirementsResult = await extractRequirements({
+        objective: opts.result.objective,
+        sandboxDirectoryPath: opts.result.sandboxDirectoryPath,
+        choices: 2,
+        temperature: opts.temperature,
+    });
+
+    const requirements = extractRequirementsResult.choices.reduce(
+        (acc, choice) =>
+            choice.requirements.length > acc.length ? acc : choice.requirements,
+        extractRequirementsResult.choices[0].requirements
+    );
+
+    return await lastValueFrom(
+        range(0, opts.times).pipe(
+            mergeMap(
+                (index) =>
+                    evaluateRefactorResultOnce(
+                        { ...opts, requirements, index },
+                        ctx
+                    ),
+                4
+            ),
+            toArray(),
+            map((evaluations) => {
+                const score = avg(evaluations.map((r) => r.score));
+                const filePaths = new Set(
+                    evaluations.flatMap((r) => Object.keys(r.scorePerFile))
+                );
+                return {
+                    score,
+                    evaluations,
+                    scorePerFile: Object.fromEntries(
+                        [...filePaths].map((filePath) => {
+                            const scores = evaluations
+                                .map((r) => r.scorePerFile[filePath])
+                                .filter((s) => s !== undefined) as number[];
+                            return [filePath, avg(scores)];
+                        })
+                    ),
+                };
+            })
+        )
+    );
+}
diff --git a/packages/refactor-bot/src/evaluate/evaluateUnchangedFile.ts b/packages/refactor-bot/src/evaluate/evaluateUnchangedFile.ts
new file mode 100644
index 0000000..a118a5a
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/evaluateUnchangedFile.ts
@@ -0,0 +1,181 @@
+import { z } from 'zod';
+
+import { makeCachedFunction } from '../cache/makeCachedFunction';
+import type { CacheStateRef } from '../cache/types';
+import type { RegularAssistantMessage } from '../chat-gpt/api';
+import { markdown } from '../markdown/markdown';
+import { formatBulletList } from '../prompt-formatters/formatBulletList';
+import { formatFileContents } from '../prompt-formatters/formatFileContents';
+import { formatOptional } from '../prompt-formatters/formatOptional';
+import {
+    prompt,
+    promptParametersFrom,
+    refactorConfigPromptOptsSchema,
+} from '../refactor/prompt';
+import { parseJsonResponse } from '../response-parsers/parseJsonResponse';
+import { format } from '../text/format';
+import { line } from '../text/line';
+import { ensureHasOneElement } from '../utils/hasOne';
+import { evaluateFileResultSchema } from './evaluateFileChanges';
+
+export const evaluateUnchangedFileInput =
+    refactorConfigPromptOptsSchema.augment({
+        requirements: z.array(z.string()).nonempty(),
+        filePath: z.string(),
+        fileContents: z.string(),
+        issues: z.array(z.string()).optional(),
+        index: z.number().optional(),
+        choices: z.number().optional(),
+        temperature: z.number().optional(),
+    });
+
+const systemPromptText = markdown`
+    Think step by step. Be concise and to the point. Do not make assumptions
+    other than what was given in the instructions.
+`;
+
+const promptText = (opts: {
+    requirements: [string, ...string[]];
+    filePath: string;
+    fileContents: string;
+    issues?: string[];
+}) =>
+    format(
+        markdown`
+            <requirements>
+            %requirements%
+            </requirements>
+
+            Given above requirements, an algorithm has decided to include the
+            file \`%filePath%\` into refactoring, but after that, decided not to
+            make modifications to that file. We now want to assess whether the
+            decision aligns with the requirements.
+
+            Below information is available to you and applies only to the file
+            \`%filePath%\`:
+
+            %fileContents%
+
+            %issues%
+
+            Assess the decision made by the algorithm and determine whether the
+            requirement was satisfied or not. Do not add new requirements. Do
+            not rephrase requirements. Do not change order of the requirements.
+            Do not assess the quality of the implementation in relation to the
+            requirement, but focus on the outcome. The number of requirements
+            returned should be the same as the number of requirements given.
+
+            Return the following JSON result when you are done:
+
+            ~~~json
+            {
+                "summary": "<summary of the assessment>",
+                "requirements": [
+                    {
+                        "description": "<short description of the requirement>",
+                        "satisfied": true
+                    }
+                ]
+            }
+            ~~~
+
+            In your final response, only include the JSON in the above format
+            wrapped with the code block. Do not include any other text or
+            output.
+        `,
+        {
+            requirements: formatBulletList({
+                items: ensureHasOneElement(opts.requirements),
+                heading: `List of requirements:`,
+            }),
+            filePath: opts.filePath,
+            fileContents: formatOptional({
+                heading: 'File contents:',
+                text: formatFileContents({
+                    fileContents: opts.fileContents,
+                    filePath: opts.filePath,
+                }),
+            }),
+            issues: opts.issues
+                ? formatBulletList({
+                      heading: 'Following issues were found in the file:',
+                      empty: line`
+                          We've checked the file for issues - no issues were
+                          found.
+                      `,
+                      items: opts.issues,
+                  })
+                : '',
+        }
+    );
+
+export const evaluateUnchangedFile = makeCachedFunction({
+    name: 'eval-file',
+    inputSchema: evaluateUnchangedFileInput,
+    resultSchema: evaluateFileResultSchema,
+    transform: async (
+        input: z.input<typeof evaluateUnchangedFileInput>,
+        ctx?: CacheStateRef
+    ) => {
+        const { filePath, requirements, fileContents, issues } = input;
+
+        const validateResponse = (message: RegularAssistantMessage) =>
+            parseJsonResponse(
+                message.content,
+                z.object({
+                    summary: z.string(),
+                    requirements: z.array(
+                        z.object({
+                            description: z.string(),
+                            satisfied: z.boolean(),
+                        })
+                    ),
+                })
+            );
+
+        const promptParams = promptParametersFrom(
+            {
+                ...input,
+                allowedFunctions: [],
+            },
+            ctx
+        );
+
+        const result = await prompt(
+            {
+                preface: systemPromptText,
+                prompt: promptText({
+                    filePath,
+                    requirements,
+                    fileContents,
+                    issues,
+                }),
+                temperature: input.temperature ?? 0.2,
+                choices: input.choices,
+                shouldStop: (message) => {
+                    try {
+                        validateResponse(message);
+                        return true;
+                    } catch (err) {
+                        if (err instanceof z.ZodError) {
+                            return err.message;
+                        }
+                        return String(err);
+                    }
+                },
+                ...promptParams,
+                seed: input.index?.toString(),
+            },
+            ctx
+        );
+
+        return {
+            key: result.key,
+            choices: ensureHasOneElement(
+                result.choices.map((choice) =>
+                    validateResponse(choice.resultingMessage)
+                )
+            ),
+        };
+    },
+});
diff --git a/packages/refactor-bot/src/evaluate/extractRequirements.ts b/packages/refactor-bot/src/evaluate/extractRequirements.ts
new file mode 100644
index 0000000..836322b
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/extractRequirements.ts
@@ -0,0 +1,115 @@
+import { z } from 'zod';
+
+import type { CacheStateRef } from '../cache/types';
+import type { RegularAssistantMessage } from '../chat-gpt/api';
+import { markdown } from '../markdown/markdown';
+import { formatZodError } from '../prompt-formatters/formatZodError';
+import {
+    prompt,
+    promptParametersFrom,
+    refactorConfigPromptOptsSchema,
+} from '../refactor/prompt';
+import { parseJsonResponse } from '../response-parsers/parseJsonResponse';
+import { format } from '../text/format';
+import { ensureHasOneElement } from '../utils/hasOne';
+
+export const extractRequirementsInput = refactorConfigPromptOptsSchema.augment({
+    objective: z.string(),
+    choices: z.number().optional(),
+    temperature: z.number().optional(),
+});
+
+const requirementsArraySchema = z.array(z.string()).nonempty();
+
+export const extractRequirementsResult = z.object({
+    key: z.string().optional(),
+    choices: z.array(
+        z.object({
+            requirements: requirementsArraySchema,
+        })
+    ),
+});
+
+const systemPromptText = markdown`
+    Think step by step. Be concise and to the point. Do not make assumptions and
+    follow instructions exactly.
+`;
+
+const promptText = (opts: { objective: string }) =>
+    format(
+        markdown`
+            <objective>
+            %objective%
+            </objective>
+
+            Given above objective, split it into requirements.
+
+            Do not modify the requirements. Do not add new requirements. Do not
+            try to guess what the implementation would look like. Do not look
+            too far ahead. Do not rephrase the requirements. Only extract the
+            minimum which is explicitly specified. Do not include implied
+            requirements. There should be a minimum of 1 requirements as a
+            result. Strive to minimize the number of requirements. Requirements
+            should not be redundant. There should be no duplicate requirements.
+
+            Return list of requirements in the following format:
+
+            ~~~json
+            ["Requirement 1", "Requirement 2"]
+            ~~~
+        `,
+        {
+            objective: opts.objective,
+        }
+    );
+
+export const extractRequirements = async (
+    input: z.input<typeof extractRequirementsInput>,
+    ctx?: CacheStateRef
+) => {
+    const { objective } = input;
+
+    const validateResponse = (message: RegularAssistantMessage) =>
+        parseJsonResponse(message.content, requirementsArraySchema);
+
+    const promptParams = promptParametersFrom(input, ctx);
+
+    const result = await prompt(
+        {
+            preface: systemPromptText,
+            prompt: promptText({
+                objective,
+            }),
+            temperature: input.temperature ?? 0.2,
+            choices: input.choices,
+            shouldStop: (message) => {
+                try {
+                    validateResponse(message);
+                    return true;
+                } catch (err) {
+                    if (err instanceof z.ZodError) {
+                        return formatZodError({
+                            error: err,
+                        });
+                    }
+                    return String(err);
+                }
+            },
+            ...promptParams,
+            functionsConfig: {
+                ...promptParams.functionsConfig,
+                allowedFunctions: [],
+            },
+        },
+        ctx
+    );
+
+    return {
+        key: result.key,
+        choices: ensureHasOneElement(
+            result.choices.map((choice) => ({
+                requirements: validateResponse(choice.resultingMessage),
+            }))
+        ),
+    };
+};
diff --git a/packages/refactor-bot/src/evaluate/utils/avg.ts b/packages/refactor-bot/src/evaluate/utils/avg.ts
new file mode 100644
index 0000000..2afaa8e
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/utils/avg.ts
@@ -0,0 +1,5 @@
+import { sum } from './sum';
+
+export function avg(values: number[]) {
+    return sum(values) / values.length;
+}
diff --git a/packages/refactor-bot/src/evaluate/utils/sum.ts b/packages/refactor-bot/src/evaluate/utils/sum.ts
new file mode 100644
index 0000000..7ab4fcd
--- /dev/null
+++ b/packages/refactor-bot/src/evaluate/utils/sum.ts
@@ -0,0 +1,3 @@
+export function sum(values: number[]) {
+    return values.reduce((acc, value) => acc + value, 0);
+}
diff --git a/packages/refactor-bot/src/git/gitShowFileCommitSummary.ts b/packages/refactor-bot/src/git/gitShowFileCommitSummary.ts
new file mode 100644
index 0000000..8c91357
--- /dev/null
+++ b/packages/refactor-bot/src/git/gitShowFileCommitSummary.ts
@@ -0,0 +1,41 @@
+import { gitFilesDiff } from './gitFilesDiff';
+import { gitShowFile } from './gitShowFile';
+
+export async function gitShowFileCommitSummary(opts: {
+    location: string;
+    filePath: string;
+    ref: string;
+}) {
+    return await gitShowFileCommitRangeSummary({
+        ...opts,
+        from: `${opts.ref}~1`,
+        to: opts.ref,
+    });
+}
+
+export async function gitShowFileCommitRangeSummary(opts: {
+    location: string;
+    filePath: string;
+    from: string;
+    to: string;
+}) {
+    const { location, filePath, from, to } = opts;
+    return {
+        filePath,
+        fileContentsBefore: await gitShowFile({
+            location,
+            filePath,
+            ref: from,
+        }),
+        fileContentsAfter: await gitShowFile({
+            location,
+            filePath,
+            ref: to,
+        }),
+        fileDiff: await gitFilesDiff({
+            location,
+            filePaths: [filePath],
+            ref: `${from}...${to}`,
+        }),
+    };
+}
diff --git a/packages/refactor-bot/src/logger/asyncFormatter.ts b/packages/refactor-bot/src/logger/asyncFormatter.ts
index 42d78aa..ca7de8d 100644
--- a/packages/refactor-bot/src/logger/asyncFormatter.ts
+++ b/packages/refactor-bot/src/logger/asyncFormatter.ts
@@ -4,7 +4,7 @@ import type { TransformCallback } from 'stream';
 
 import { onceAsync } from '../utils/onceAsync';
 
-export type FormatterMethod = <Y>(value: Y) => string | Y | Promise<string | Y>;
+export type FormatterMethod = (value: unknown) => unknown;
 
 export type Formatters = {
     [key: string]: FormatterMethod;
diff --git a/packages/refactor-bot/src/logger/formatters.ts b/packages/refactor-bot/src/logger/formatters.ts
index 89a83ae..1e49f18 100644
--- a/packages/refactor-bot/src/logger/formatters.ts
+++ b/packages/refactor-bot/src/logger/formatters.ts
@@ -1,7 +1,27 @@
+import { relative } from 'path';
+
 import { glowFormat } from '../markdown/glowFormat';
 import type { Formatters } from './asyncFormatter';
 import { formatObject } from './formatObject';
 
+function formatLocation(input: string) {
+    const cwd = process.env['LOG_RELATIVE_TO_CWD'] ?? process.cwd();
+    if (input.startsWith(cwd)) {
+        return relative(cwd, input);
+    }
+    return input;
+}
+
+function formatLocationArray(input: unknown) {
+    if (Array.isArray(input)) {
+        return input.map((item) =>
+            typeof item === 'string' ? formatLocation(item) : (item as unknown)
+        );
+    } else {
+        return input;
+    }
+}
+
 export const formatters: Formatters = {
     objective: (input) =>
         typeof input === 'string' ? glowFormat({ input }) : input,
@@ -9,4 +29,13 @@ export const formatters: Formatters = {
         typeof input === 'object' && input !== null
             ? formatObject(input)
             : input,
+    location: (input) => {
+        if (typeof input === 'string') {
+            return formatLocation(input);
+        } else {
+            return input;
+        }
+    },
+    executionLog: (input) => formatLocationArray(input),
+    resultFilePaths: (input) => formatLocationArray(input),
 };
diff --git a/packages/refactor-bot/src/package-manager/runCheckCommand.ts b/packages/refactor-bot/src/package-manager/runCheckCommand.ts
index 3e483d2..d017fee 100644
--- a/packages/refactor-bot/src/package-manager/runCheckCommand.ts
+++ b/packages/refactor-bot/src/package-manager/runCheckCommand.ts
@@ -123,7 +123,7 @@ export async function runCheckCommandWithParser(opts: {
     outputParser: (output: string) => Issue[];
     location: string;
 }) {
-    const { stdout, stderr } = await runPackageManagerScript({
+    const { stdout, stderr, args } = await runPackageManagerScript({
         packageManager: opts.packageManager,
         script: opts.script.args[0],
         args: opts.script.args.slice(1),
@@ -151,16 +151,19 @@ export async function runCheckCommandWithParser(opts: {
         () => opts.location
     );
 
-    return opts.outputParser(chooseOutput()).map((data) => {
-        const issue = data.issue.replaceAll(parentDirectoryRegex, '');
-        return {
-            ...data,
-            issue,
-            filePath: isAbsolute(data.filePath)
-                ? relative(realLocation, data.filePath)
-                : data.filePath,
-        };
-    });
+    return {
+        args,
+        issues: opts.outputParser(chooseOutput()).map((data) => {
+            const issue = data.issue.replaceAll(parentDirectoryRegex, '');
+            return {
+                ...data,
+                issue,
+                filePath: isAbsolute(data.filePath)
+                    ? relative(realLocation, data.filePath)
+                    : data.filePath,
+            };
+        }),
+    };
 }
 
 export async function runCheckCommand(opts: {
diff --git a/packages/refactor-bot/src/playground.ts b/packages/refactor-bot/src/playground.ts
index 6fee480..4f26e99 100644
--- a/packages/refactor-bot/src/playground.ts
+++ b/packages/refactor-bot/src/playground.ts
@@ -1,45 +1,56 @@
-import { gitFilesDiff } from './git/gitFilesDiff';
-import { gitShowFile } from './git/gitShowFile';
+import { join } from 'path';
+import { ignoreElements, tap } from 'rxjs';
+
+import { bootstrap } from './bootstrap';
+import { evaluateFileChanges } from './evaluate/evaluateFileChanges';
+import { runEpic } from './event-bus';
+import { gitShowFileCommitSummary } from './git/gitShowFileCommitSummary';
 import { logger } from './logger/logger';
-import { changeInfo } from './ts-morph/quick-info/changeInfo';
 
 /**
  * @note this is a playground file, it is not used in production, I basically
  * run this file in watch mode to see if the changes I make to the codebase
- * are working as expected as I don't have a good way to test with "ts-morph"
- * in dependencies
+ * are working as expected - because sometimes you just don't want to commit to
+ * maintaining tests (especially if they require a lot of "test-cases" and
+ * setup "data" to go along with them - this is especially true for functions
+ * that work with codebase)
  *
  * @note want to test something? try it yourself:
  * pnpm tsx --watch ./src/playground.ts
- *
- * @note try Quokka VSCode extension for a better playground experience
  */
 
 const location =
-    '/private/var/folders/kk/cj4kd1wx1rg16xq_jhdw75kc0000gn/T/.refactor-bot/sandboxes/replace-read-file-sync-unr87ijk';
+    '/var/folders/kk/cj4kd1wx1rg16xq_jhdw75kc0000gn/T/.refactor-bot/sandboxes/test-likely-failure-oTH22sfc';
 
-const filePath = 'src/pipeline/dependencies.ts';
-
-const fileDiff = await gitFilesDiff({
-    location,
-    filePaths: [filePath],
-    ref: 'HEAD~1',
-});
+runEpic((stream) =>
+    stream.pipe(
+        tap((event) => {
+            logger.trace(event);
+        }),
+        ignoreElements()
+    )
+);
 
 logger.info(
-    await changeInfo({
-        location,
-        filePath,
-        oldFileContents: await gitShowFile({
-            location,
-            filePath,
-            ref: 'HEAD~1',
-        }),
-        newFileContents: await gitShowFile({
-            location,
-            filePath,
-            ref: 'HEAD',
-        }),
-        fileDiff,
+    await bootstrap(async () => {
+        return await evaluateFileChanges(
+            {
+                sandboxDirectoryPath: location,
+                requirements: [
+                    'Replace all usages of `readFile` from `fs/promises` module with `readFileSync` from `fs` module in packages/refactor-bot/src/cache/dependencies.ts`.',
+                ],
+                ...(await gitShowFileCommitSummary({
+                    location,
+                    filePath: 'packages/refactor-bot/src/cache/dependencies.ts',
+                    ref: '28d5f0d1f7985bd16e4d3cc4f26ffd53c1a6f94b',
+                })),
+            },
+            {
+                location: join(
+                    '.refactor-bot/playground-cache',
+                    'evaluateFileChanges'
+                ),
+            }
+        );
     })
 );
diff --git a/packages/refactor-bot/src/prompt-formatters/formatBulletList.ts b/packages/refactor-bot/src/prompt-formatters/formatBulletList.ts
new file mode 100644
index 0000000..2e767ac
--- /dev/null
+++ b/packages/refactor-bot/src/prompt-formatters/formatBulletList.ts
@@ -0,0 +1,17 @@
+export function formatBulletList(opts: {
+    items: string[];
+    empty?: string;
+    heading?: string;
+}) {
+    if (opts.items.length === 0) {
+        return opts.empty || '';
+    }
+
+    const list = opts.items.map((item) => `- ${item}`).join('\n');
+
+    if (opts.heading) {
+        return `${opts.heading}\n${list}`;
+    }
+
+    return list;
+}
diff --git a/packages/refactor-bot/src/prompt-formatters/formatFencedCodeBlock.ts b/packages/refactor-bot/src/prompt-formatters/formatFencedCodeBlock.ts
new file mode 100644
index 0000000..1223431
--- /dev/null
+++ b/packages/refactor-bot/src/prompt-formatters/formatFencedCodeBlock.ts
@@ -0,0 +1,26 @@
+import { markdown } from '../markdown/markdown';
+import { escapeRegExp } from '../utils/escapeRegExp';
+
+export function formatFencedCodeBlock(opts: {
+    code: string;
+    language?: string;
+    marker?: '```' | '~~~';
+}) {
+    if (!opts.code) {
+        return '';
+    }
+
+    const marker =
+        opts.marker ?? opts.code.search(/^```/g) >= 0 ? '~~~' : '```';
+    const candidateMarker = new RegExp(`^${escapeRegExp(marker)}`, 'g');
+
+    if (opts.code.search(candidateMarker) >= 0) {
+        throw new Error(`The code block contains the marker "${marker}"`);
+    }
+
+    return markdown`
+        ${marker}${opts.language ?? ''}
+        ${opts.code}
+        ${marker}
+    `;
+}
diff --git a/packages/refactor-bot/src/prompt-formatters/formatFileContents.ts b/packages/refactor-bot/src/prompt-formatters/formatFileContents.ts
new file mode 100644
index 0000000..90d8514
--- /dev/null
+++ b/packages/refactor-bot/src/prompt-formatters/formatFileContents.ts
@@ -0,0 +1,42 @@
+import { extname } from 'path';
+
+import { formatFencedCodeBlock } from './formatFencedCodeBlock';
+
+export function formatFileContents(opts: {
+    fileContents: string;
+    language?: string;
+    filePath?: string;
+}) {
+    let { language } = opts;
+
+    if (!language && opts.filePath) {
+        const ext = extname(opts.filePath);
+
+        switch (ext) {
+            case '.js':
+                language = 'js';
+                break;
+            case '.ts':
+                language = 'ts';
+                break;
+            case '.json':
+                language = 'json';
+                break;
+            case '.md':
+                language = 'markdown';
+                break;
+            case '.sh':
+                language = 'sh';
+                break;
+            case '.yml':
+            case '.yaml':
+                language = 'yaml';
+                break;
+        }
+    }
+
+    return formatFencedCodeBlock({
+        code: opts.fileContents,
+        language,
+    });
+}
diff --git a/packages/refactor-bot/src/prompt-formatters/formatFileDiff.ts b/packages/refactor-bot/src/prompt-formatters/formatFileDiff.ts
new file mode 100644
index 0000000..e92080d
--- /dev/null
+++ b/packages/refactor-bot/src/prompt-formatters/formatFileDiff.ts
@@ -0,0 +1,8 @@
+import { formatFencedCodeBlock } from './formatFencedCodeBlock';
+
+export function formatFileDiff(opts: { fileDiff: string }) {
+    return formatFencedCodeBlock({
+        code: opts.fileDiff,
+        language: 'diff',
+    });
+}
diff --git a/packages/refactor-bot/src/prompt-formatters/formatOptional.ts b/packages/refactor-bot/src/prompt-formatters/formatOptional.ts
new file mode 100644
index 0000000..8ff1a58
--- /dev/null
+++ b/packages/refactor-bot/src/prompt-formatters/formatOptional.ts
@@ -0,0 +1,12 @@
+export function formatOptional(opts: {
+    text?: string;
+    empty?: string;
+    heading?: string;
+    footer?: string;
+}) {
+    if (!opts.text || opts.text.length === 0) {
+        return opts.empty || '';
+    }
+
+    return [opts.heading, opts.text, opts.footer].filter(Boolean).join('\n');
+}
diff --git a/packages/refactor-bot/src/prompt-formatters/formatZodError.ts b/packages/refactor-bot/src/prompt-formatters/formatZodError.ts
new file mode 100644
index 0000000..0ea189b
--- /dev/null
+++ b/packages/refactor-bot/src/prompt-formatters/formatZodError.ts
@@ -0,0 +1,19 @@
+import type { z } from 'zod';
+
+export function formatZodError(opts: {
+    error: z.ZodError;
+    heading?: string;
+    footer?: string;
+}) {
+    const issues = opts.error.errors
+        .map((error) => `- ${error.message}`)
+        .join('\n');
+
+    return [
+        opts.heading || 'Parsing failed, following issues found:',
+        issues,
+        opts.footer,
+    ]
+        .filter(Boolean)
+        .join('\n');
+}
diff --git a/packages/refactor-bot/src/prompt/cli.ts b/packages/refactor-bot/src/prompt/cli.ts
index b2cbc49..295389a 100644
--- a/packages/refactor-bot/src/prompt/cli.ts
+++ b/packages/refactor-bot/src/prompt/cli.ts
@@ -29,6 +29,12 @@ export const promptCommand: CommandModule<
                 describe:
                     'Prompt for next action confirmation when new messages are received',
                 default: false,
+            })
+            .option('functions', {
+                type: 'string',
+                array: true,
+                describe:
+                    'Names of functions to allow in the prompt. If not specified, all functions are allowed.',
             }),
     handler: async (opts) => {
         try {
diff --git a/packages/refactor-bot/src/prompt/run.ts b/packages/refactor-bot/src/prompt/run.ts
index a7b4b79..b146efe 100644
--- a/packages/refactor-bot/src/prompt/run.ts
+++ b/packages/refactor-bot/src/prompt/run.ts
@@ -199,6 +199,7 @@ export const run = async (opts: {
     model?: Models;
     watch?: boolean;
     manual?: boolean;
+    functions?: string[];
 }) => {
     const spinner = ora();
 
@@ -246,7 +247,7 @@ export const run = async (opts: {
     const convo = conversationState(conversationFile);
     await convo.load();
 
-    const functions = await includeFunctions('all');
+    const functions = await includeFunctions(opts.functions ?? 'all');
 
     await goToEndOfFile(conversationFile);
 
diff --git a/packages/refactor-bot/src/refactor/actions/acceptedEdit.ts b/packages/refactor-bot/src/refactor/actions/acceptedEdit.ts
index a4704eb..88b2697 100644
--- a/packages/refactor-bot/src/refactor/actions/acceptedEdit.ts
+++ b/packages/refactor-bot/src/refactor/actions/acceptedEdit.ts
@@ -1,7 +1,7 @@
 import type { z } from 'zod';
 
 import { declareAction } from '../../event-bus';
-import type { refactorResultSchema } from '../types';
+import type { refactorFileResultSchema } from '../types';
 
 /**
  * When result of a file edit is accepted, an accepted edit will
@@ -10,5 +10,5 @@ import type { refactorResultSchema } from '../types';
  */
 export const acceptedEdit = declareAction(
     'acceptedEdit',
-    (result: z.infer<typeof refactorResultSchema>) => result
+    (result: z.infer<typeof refactorFileResultSchema>) => result
 );
diff --git a/packages/refactor-bot/src/refactor/actions/discardedEdit.ts b/packages/refactor-bot/src/refactor/actions/discardedEdit.ts
index a9816b1..39b33f6 100644
--- a/packages/refactor-bot/src/refactor/actions/discardedEdit.ts
+++ b/packages/refactor-bot/src/refactor/actions/discardedEdit.ts
@@ -1,12 +1,12 @@
 import type { z } from 'zod';
 
 import { declareAction } from '../../event-bus';
-import type { refactorResultSchema } from '../types';
+import type { refactorFileResultSchema } from '../types';
 
 /**
  * When result of a file edit is discarded
  */
 export const discardedEdit = declareAction(
     'discardedEdit',
-    (result: z.infer<typeof refactorResultSchema>) => result
+    (result: z.infer<typeof refactorFileResultSchema>) => result
 );
diff --git a/packages/refactor-bot/src/refactor/check.ts b/packages/refactor-bot/src/refactor/check.ts
index 12badff..359ba19 100644
--- a/packages/refactor-bot/src/refactor/check.ts
+++ b/packages/refactor-bot/src/refactor/check.ts
@@ -53,7 +53,11 @@ export const check = makeCachedFunction({
         );
         return {
             location: opts.location,
-            issues: results.reduce((acc, result) => acc.concat(result), []),
+            commands: results.map((result) => result.args.join(' ')),
+            issues: results.reduce(
+                (acc, result) => acc.concat(result.issues),
+                [] as (typeof results)[number]['issues']
+            ),
             checkedFiles: opts.filePaths,
         };
     },
@@ -140,6 +144,7 @@ export const checksSummary = (opts: {
         newIssues,
         remainingIssues,
         resolvedIssues,
+        commands: opts.checkResult.commands,
         totalNumberOfIssues: newIssues.length + remainingIssues.length,
     };
 };
diff --git a/packages/refactor-bot/src/refactor/checkoutSandbox.ts b/packages/refactor-bot/src/refactor/checkoutSandbox.ts
index 79f7402..eee1b19 100644
--- a/packages/refactor-bot/src/refactor/checkoutSandbox.ts
+++ b/packages/refactor-bot/src/refactor/checkoutSandbox.ts
@@ -78,11 +78,6 @@ export const checkoutSandbox = makeCachedFunction({
         } else {
             logger.trace(`Creating sandbox from "${root}"`);
 
-            /**
-             * @todo: this might copy some files that might be
-             * sensitive, we should probably introduce a way to
-             * ignore some files (ie .env)
-             */
             await createSandbox({
                 tag: config.name,
                 source: root,
diff --git a/packages/refactor-bot/src/refactor/cli.ts b/packages/refactor-bot/src/refactor/cli.ts
index f2d7b35..af59c7b 100644
--- a/packages/refactor-bot/src/refactor/cli.ts
+++ b/packages/refactor-bot/src/refactor/cli.ts
@@ -22,24 +22,37 @@ export const refactorCommand: CommandModule<
                 type: 'string',
                 describe: line`
                     Unique id of the refactoring that was previously run but
-                    didn't finish to start from last successful point
+                    didn't finish - use this to start from last successful step
                 `,
             })
             .option('save-to-cache', {
                 type: 'boolean',
                 describe: line`
                     Whether to enable saving results to the cache, by default
-                    it's enabled
+                    it's enabled.
                 `,
                 default: true,
+                hidden: true,
             })
             .option('enable-cache-for', {
                 type: 'string',
                 array: true,
                 describe: line`
-                    Enable cache for specific steps only, can be useful if we
-                    want to disable cache for all other steps and replay them
+                    Enable cache for specific steps - you can specify the name
+                    of the step or a name followed by a hash of the cache entry.
+                    This is for debugging purposes only.
                 `,
+                hidden: true,
+            })
+            .option('disable-cache-for', {
+                type: 'string',
+                array: true,
+                describe: line`
+                    Disable cache for specific steps - you can specify the name
+                    of the step or a name followed by a hash of the cache entry.
+                    This is for debugging purposes only.
+                `,
+                hidden: true,
             })
             .option('costs', {
                 type: 'boolean',
diff --git a/packages/refactor-bot/src/refactor/collectLlmUsage.ts b/packages/refactor-bot/src/refactor/collectLlmUsage.ts
new file mode 100644
index 0000000..70bc800
--- /dev/null
+++ b/packages/refactor-bot/src/refactor/collectLlmUsage.ts
@@ -0,0 +1,145 @@
+import type { ObservedValueOf } from 'rxjs';
+import { filter, type Observable, scan, startWith } from 'rxjs';
+import type { z } from 'zod';
+
+import { explainCacheKey } from '../cache/cache';
+import { calculatePrice } from '../chat-gpt/api';
+import { actions, type AnyAction } from '../event-bus';
+import { ofTypes } from '../event-bus/operators';
+import { gptRequestSuccess } from './actions/gptRequestSuccess';
+import type { llmUsageEntrySchema } from './types';
+
+type Usage = z.output<typeof llmUsageEntrySchema>;
+
+export const collectLlmUsage =
+    (opts?: { key?: string }) => (input: Observable<AnyAction>) =>
+        input.pipe(
+            ofTypes(gptRequestSuccess),
+            filter((event) => {
+                if (!opts?.key) {
+                    return true;
+                }
+
+                if (event.data.key?.startsWith(opts.key)) {
+                    return true;
+                }
+                return false;
+            }),
+            scan((usageByStep, event) => {
+                const steps = explainCacheKey(event.data.key) ?? [
+                    { name: 'unknown', hash: 'unknown' },
+                ];
+
+                const usage = event.data.response.usage;
+
+                usageByStep.push({
+                    model: event.data.model,
+                    steps: steps.map((x) => x.name),
+                    usage,
+                });
+
+                return usageByStep;
+            }, [] as Array<Usage>),
+            startWith([] as Array<Usage>)
+        );
+
+export function startCollectingLlmUsage(
+    opts?: { key?: string },
+    deps = { actions }
+) {
+    let result: ObservedValueOf<ReturnType<ReturnType<typeof collectLlmUsage>>>;
+    const subscription = deps
+        .actions()
+        .pipe(collectLlmUsage(opts))
+        .subscribe({
+            next: (usage) => {
+                result = usage;
+            },
+        });
+
+    return {
+        getUsage() {
+            return result;
+        },
+        finishCollecting() {
+            subscription.unsubscribe();
+            return result;
+        },
+    };
+}
+
+export function summarizeLlmUsagePrice(params: { usage: Array<Usage> }) {
+    const priceBySteps = new Map<
+        string,
+        {
+            promptPrice: number;
+            completionPrice: number;
+            totalPrice: number;
+        }
+    >();
+
+    let totalPrice = 0;
+
+    for (const { model, usage, steps } of params.usage) {
+        const price = calculatePrice({ model, usage });
+
+        for (const step of steps) {
+            const current = priceBySteps.get(step) ?? {
+                promptPrice: 0,
+                completionPrice: 0,
+                totalPrice: 0,
+            };
+
+            priceBySteps.set(step, {
+                promptPrice: current.promptPrice + price.promptPrice,
+                completionPrice:
+                    current.completionPrice + price.completionPrice,
+                totalPrice: current.totalPrice + price.totalPrice,
+            });
+        }
+
+        totalPrice += price.totalPrice;
+    }
+
+    return {
+        priceBySteps,
+        totalPrice,
+    };
+}
+
+export function summarizeLlmUsageTokens(params: { usage: Array<Usage> }) {
+    const tokensBySteps = new Map<
+        string,
+        {
+            promptTokens: number;
+            completionTokens: number;
+            totalTokens: number;
+        }
+    >();
+
+    let totalTokens = 0;
+
+    for (const { usage, steps } of params.usage) {
+        for (const step of steps) {
+            const current = tokensBySteps.get(step) ?? {
+                promptTokens: 0,
+                completionTokens: 0,
+                totalTokens: 0,
+            };
+
+            tokensBySteps.set(step, {
+                promptTokens: current.promptTokens + usage.promptTokens,
+                completionTokens:
+                    current.completionTokens + usage.completionTokens,
+                totalTokens: current.totalTokens + usage.totalTokens,
+            });
+        }
+
+        totalTokens += usage.totalTokens;
+    }
+
+    return {
+        tokensBySteps,
+        totalTokens,
+    };
+}
diff --git a/packages/refactor-bot/src/refactor/determineModelParameters.ts b/packages/refactor-bot/src/refactor/determineModelParameters.ts
index 867306c..04e8e8e 100644
--- a/packages/refactor-bot/src/refactor/determineModelParameters.ts
+++ b/packages/refactor-bot/src/refactor/determineModelParameters.ts
@@ -1,18 +1,29 @@
 import micromatch from 'micromatch';
-import type { z } from 'zod';
+import { z } from 'zod';
 
 import { logger } from '../logger/logger';
 import { hasOneElement } from '../utils/hasOne';
-import type { refactorConfigSchema } from './types';
+import { refactorConfigSchema } from './types';
 
-type ModelOpts = Pick<
-    z.output<typeof refactorConfigSchema>,
-    'model' | 'modelByStepCode' | 'useMoreExpensiveModelsOnRetry'
-> & {
-    attempt?: number;
-};
+export const refactorConfigModelParamsSchema = refactorConfigSchema
+    .pick({
+        model: true,
+        modelByStepCode: true,
+        useMoreExpensiveModelsOnRetry: true,
+    })
+    .augment({
+        attempt: z.number().optional(),
+    });
+
+export type RefactorConfigModelParams = z.input<
+    typeof refactorConfigModelParamsSchema
+>;
 
-function determineModel(input: ModelOpts, ctx?: { location?: string }) {
+function determineModel(
+    inputRaw: RefactorConfigModelParams,
+    ctx?: { location?: string }
+) {
+    const input = refactorConfigModelParamsSchema.parse(inputRaw);
     if (ctx?.location) {
         const location = ctx.location;
         const matchingKeys = [...Object.keys(input.modelByStepCode)].filter(
@@ -30,9 +41,10 @@ function determineModel(input: ModelOpts, ctx?: { location?: string }) {
 }
 
 export function determineModelParameters(
-    input: ModelOpts,
+    inputRaw: RefactorConfigModelParams,
     ctx?: { location?: string }
 ) {
+    const input = refactorConfigModelParamsSchema.parse(inputRaw);
     const model = determineModel(input, ctx);
 
     const result =
diff --git a/packages/refactor-bot/src/refactor/filesToEdit.ts b/packages/refactor-bot/src/refactor/filesToEdit.ts
new file mode 100644
index 0000000..a6166ec
--- /dev/null
+++ b/packages/refactor-bot/src/refactor/filesToEdit.ts
@@ -0,0 +1,110 @@
+import { z } from 'zod';
+
+import { makeCachedFunction } from '../cache/makeCachedFunction';
+import { markdown } from '../markdown/markdown';
+import { format } from '../text/format';
+import { validateAndParseListOfFiles } from './parsers/validateAndParseListOfFiles';
+import {
+    prompt,
+    promptParametersFrom,
+    refactorConfigPromptOptsSchema,
+} from './prompt';
+
+export const determineFilesToEditInputSchema =
+    refactorConfigPromptOptsSchema.augment({
+        objective: z.string(),
+    });
+
+export const determineFilesToEditResultSchema = z.object({
+    /**
+     * List of files allowed to be edited or refactored when mentioned in the
+     * objective, in the same order as mentioned in the objective.
+     *
+     * If the objective does not mention any files, this field is empty.
+     */
+    filesToEdit: z.array(z.string()),
+});
+
+const systemPrompt = markdown`
+    Think step by step. Be concise and to the point. Do not make assumptions and
+    follow instructions exactly.
+`;
+
+const promptText = (opts: { objective: string }) =>
+    format(
+        markdown`
+            <objective>
+            %objective%
+            </objective>
+
+            Given the above objective that implies modifications to source code,
+            determine the list of files the user wants us to edit or refactor.
+
+            1. Check the objective for mentions of files to be changed, edited
+               or refactored. Could be a single file or multiple files.
+
+            2. If there is no mention of files respond "No mention of files to
+               be edited or refactored".
+
+            3. If the resulting list from step #1 is not empty, format it -
+               return a numbered bullet list as shown below. File paths should
+               be surrounded by a backtick. Retain the order of files as they
+               appear in the objective.
+
+            <example>
+            #. \`path/to/file.ts\`
+            #. \`path/to/another/file.ts\`
+            </example>
+        `,
+        {
+            objective: opts.objective,
+        }
+    );
+
+export const determineFilesToEdit = makeCachedFunction({
+    name: 'files-to-edit',
+    inputSchema: determineFilesToEditInputSchema,
+    resultSchema: determineFilesToEditResultSchema,
+    transform: async (input, ctx) => {
+        const promptParams = promptParametersFrom(
+            {
+                ...input,
+                /**
+                 * @note we do not want to allow any functions for this prompt
+                 */
+                allowedFunctions: [],
+            },
+            ctx
+        );
+
+        const allowedFilesResult = await prompt(
+            {
+                preface: systemPrompt,
+                prompt: promptText({
+                    objective: input.objective,
+                }),
+                temperature: 0.2,
+                ...promptParams,
+                shouldStop: async (message) => {
+                    await validateAndParseListOfFiles({
+                        sandboxDirectoryPath: input.sandboxDirectoryPath,
+                        text: message.content,
+                        sortBySize: false,
+                    });
+                    return true as const;
+                },
+            },
+            ctx
+        );
+
+        const filesToEdit = await validateAndParseListOfFiles({
+            sandboxDirectoryPath: input.sandboxDirectoryPath,
+            text: allowedFilesResult.choices[0].resultingMessage.content,
+            sortBySize: false,
+        });
+
+        return {
+            filesToEdit,
+        };
+    },
+});
diff --git a/packages/refactor-bot/src/refactor/findRefactorStateLocation.ts b/packages/refactor-bot/src/refactor/findRefactorStateLocation.ts
new file mode 100644
index 0000000..06bddc5
--- /dev/null
+++ b/packages/refactor-bot/src/refactor/findRefactorStateLocation.ts
@@ -0,0 +1,44 @@
+import { globby } from 'globby';
+import { basename, dirname } from 'path';
+
+import { ConfigurationError } from '../errors/configurationError';
+import { findRepositoryRoot } from '../file-system/findRepositoryRoot';
+import { hasOneElement } from '../utils/hasOne';
+
+/**
+ * Finds the location of the state files for a given refactor id, the state
+ * files will store temporary information about the refactor, cached data and
+ * other information that is not needed to be stored in the git repository.
+ */
+export async function findRefactorStateLocation(opts: {
+    id: string;
+    location?: string;
+}) {
+    if (!opts.id) {
+        throw new Error('id is required');
+    }
+
+    const repoRoot = await findRepositoryRoot(opts.location);
+
+    const location = await globby(
+        `.refactor-bot/refactors/*/state/${opts.id}/`,
+        {
+            cwd: repoRoot,
+            onlyDirectories: true,
+        }
+    );
+
+    if (!hasOneElement(location)) {
+        throw new ConfigurationError(
+            `Cannot find files to load state from for id "${opts.id}"`
+        );
+    }
+
+    const name = basename(dirname(dirname(location[0])));
+
+    return {
+        id: opts.id,
+        name,
+        location: location[0],
+    };
+}
diff --git a/packages/refactor-bot/src/refactor/planAndRefactor.ts b/packages/refactor-bot/src/refactor/planAndRefactor.ts
index 982283b..f403985 100644
--- a/packages/refactor-bot/src/refactor/planAndRefactor.ts
+++ b/packages/refactor-bot/src/refactor/planAndRefactor.ts
@@ -2,19 +2,17 @@ import { z } from 'zod';
 
 import { CycleDetectedError } from '../errors/cycleDetectedError';
 import { logger } from '../logger/logger';
+import { line } from '../text/line';
 import { scriptSchema } from './check';
 import { planFiles, planFilesResultSchema } from './planFiles';
 import { refactorBatch } from './refactorBatch';
 import { resetToLastAcceptedCommit } from './resetToLastAcceptedCommit';
 import type { RefactorFilesResult } from './types';
-import {
-    mutateToMergeRefactorFilesResults,
-    refactorConfigSchema,
-    refactorFilesResultSchema,
-} from './types';
+import { refactorConfigSchema, refactorFilesResultSchema } from './types';
 
 export const planAndRefactorInputSchema = refactorConfigSchema.augment({
     objective: z.string(),
+    requirements: z.array(z.string()).nonempty(),
     startCommit: z.string(),
     sandboxDirectoryPath: z.string(),
     scripts: z.array(scriptSchema),
@@ -32,8 +30,8 @@ export const planAndRefactor = async (
     input: z.input<typeof planAndRefactorInputSchema>
 ) => {
     const files: RefactorFilesResult = {
-        accepted: {},
-        discarded: {},
+        accepted: [],
+        discarded: [],
     };
 
     const planFilesResults: Array<z.output<typeof planFilesResultSchema>> = [];
@@ -45,7 +43,7 @@ export const planAndRefactor = async (
         rawResponse: planResult.rawResponse,
     });
 
-    const { plannedFiles } = planResult;
+    const plannedFiles = [...planResult.plannedFiles];
 
     while (plannedFiles.length > 0) {
         const result = await refactorBatch({
@@ -58,10 +56,8 @@ export const planAndRefactor = async (
             result,
         });
 
-        mutateToMergeRefactorFilesResults({
-            from: result,
-            into: files,
-        });
+        files.accepted.push(...result.accepted);
+        files.discarded.push(...result.discarded);
 
         const repeatedPlanResult = await planFiles(input).catch((err) => {
             if (err instanceof CycleDetectedError) {
@@ -71,7 +67,11 @@ export const planAndRefactor = async (
                  * list instead.
                  */
                 logger.warn(
-                    'Cycle detected when planning files to change, this is likely result of the last batch of changes not producing any changes.',
+                    line`
+                        Cycle detected when planning files to change, this is
+                        likely result of the last batch of file edits not
+                        producing any changes
+                    `,
                     {
                         error: err,
                         result,
diff --git a/packages/refactor-bot/src/refactor/prompt.ts b/packages/refactor-bot/src/refactor/prompt.ts
index 1cae15c..0b47d35 100644
--- a/packages/refactor-bot/src/refactor/prompt.ts
+++ b/packages/refactor-bot/src/refactor/prompt.ts
@@ -1,11 +1,20 @@
 import hash from 'object-hash';
 import type { Observable } from 'rxjs';
-import { defer, EMPTY, from, lastValueFrom } from 'rxjs';
-import { expand, mergeAll, mergeMap, toArray } from 'rxjs/operators';
+import { defer, EMPTY, from, lastValueFrom, of } from 'rxjs';
+import {
+    catchError,
+    expand,
+    map,
+    mergeAll,
+    mergeMap,
+    switchMap,
+    toArray,
+} from 'rxjs/operators';
 import type { TypeOf } from 'zod';
 import { z } from 'zod';
 
 import { makeCachedFunction } from '../cache/makeCachedFunction';
+import type { CacheStateRef } from '../cache/types';
 import type { Message } from '../chat-gpt/api';
 import {
     calculatePrice,
@@ -27,6 +36,11 @@ import { ensureHasOneElement } from '../utils/hasOne';
 import { isTruthy } from '../utils/isTruthy';
 import { gptRequestFailed } from './actions/gptRequestFailed';
 import { gptRequestSuccess } from './actions/gptRequestSuccess';
+import {
+    determineModelParameters,
+    refactorConfigModelParamsSchema,
+} from './determineModelParameters';
+import { refactorConfigSchema } from './types';
 
 export const promptInputSchema = z.object({
     preface: z.string().optional(),
@@ -46,9 +60,12 @@ export const promptInputSchema = z.object({
         .optional(),
     model: modelsSchema.optional().default('gpt-3.5-turbo'),
     choices: z.number().optional(),
+    dedupe: z.boolean().optional(),
+    seed: z.string().optional(),
 });
 
 export const promptResultSchema = z.object({
+    key: z.string().optional(),
     choices: z
         .array(
             z.object({
@@ -210,6 +227,39 @@ function removeFunctionFromState(
     });
 }
 
+export type RefactorConfigPromptOpts = z.input<
+    typeof refactorConfigPromptOptsSchema
+>;
+
+export const refactorConfigPromptOptsSchema = refactorConfigSchema
+    .pick({
+        budgetCents: true,
+        scope: true,
+        tsConfigJsonFileName: true,
+        allowedFunctions: true,
+    })
+    .merge(refactorConfigModelParamsSchema)
+    .augment({
+        sandboxDirectoryPath: z.string(),
+    });
+
+export const promptParametersFrom = (
+    inputRaw: RefactorConfigPromptOpts,
+    ctx?: CacheStateRef
+): Omit<z.input<typeof promptInputSchema>, 'temperature' | 'prompt'> => {
+    const input = refactorConfigPromptOptsSchema.parse(inputRaw);
+    return {
+        budgetCents: input.budgetCents,
+        functionsConfig: {
+            repositoryRoot: input.sandboxDirectoryPath,
+            scope: input.scope,
+            tsConfigJsonFileName: input.tsConfigJsonFileName,
+            allowedFunctions: input.allowedFunctions,
+        },
+        ...determineModelParameters(input, ctx),
+    };
+};
+
 export const prompt = makeCachedFunction({
     name: 'prompt',
     inputSchema: promptInputSchema,
@@ -234,7 +284,7 @@ export const prompt = makeCachedFunction({
         };
 
         const stream = from([initialState()]).pipe(
-            expand((state, i) => {
+            expand((state) => {
                 const lastMessage = state.messages[state.messages.length - 1];
                 if (!lastMessage) {
                     throw new Error('Invalid state, no last message found');
@@ -254,18 +304,13 @@ export const prompt = makeCachedFunction({
                                     opts.functionsConfig.allowedFunctions,
                                 model: opts.model,
                                 temperature: opts.temperature,
-                                /**
-                                 * @note if this is the first prompt, we
-                                 * ask for multiple choices answer to start
-                                 * multiple branches of conversation
-                                 */
-                                ...(i === 0 && {
-                                    choices: opts.choices,
-                                }),
+                                choices: opts.choices,
                             },
                             ctx
                         );
 
+                        const shouldDedupe = opts.dedupe ?? false;
+
                         const unique = new Map(
                             result.response.choices.map(
                                 (choice) =>
@@ -273,12 +318,15 @@ export const prompt = makeCachedFunction({
                             )
                         );
 
-                        const nextStateChoices = [...unique.values()].map(
-                            (choice) => ({
-                                status: choice.finishReason,
-                                messages: [...state.messages, choice.message],
-                            })
-                        );
+                        const nextStateChoices = shouldDedupe
+                            ? [...unique.values()].map((choice) => ({
+                                  status: choice.finishReason,
+                                  messages: [...state.messages, choice.message],
+                              }))
+                            : result.response.choices.map((choice) => ({
+                                  status: choice.finishReason,
+                                  messages: [...state.messages, choice.message],
+                              }));
 
                         if (
                             nextStateChoices.length === 1 &&
@@ -337,9 +385,9 @@ export const prompt = makeCachedFunction({
                         );
 
                         return defer(async () => {
-                            const result = await Promise.resolve(
-                                shouldStop(lastMessage)
-                            ).catch((e) =>
+                            const result = await new Promise((resolve) => {
+                                resolve(shouldStop(lastMessage));
+                            }).catch((e) =>
                                 e instanceof Error ? e.message : String(e)
                             );
 
@@ -370,24 +418,52 @@ export const prompt = makeCachedFunction({
                 }
             }),
             mergeMap((state) => {
-                // everything that gets into expand gets out of it, so we must
-                // filter out the initial state
+                // everything that gets into expand and what gets out of it
+                // will be result of the expand operator, so we must
+                // filter out only the stop messages that are not failed
                 if (state.status !== 'stop') {
                     return EMPTY;
                 }
 
-                return [
-                    {
-                        resultingMessage: regularAssistantMessageSchema.parse(
+                const shouldStop = opts.shouldStop;
+                if (shouldStop) {
+                    // we also need to filter out the stop messages that are
+                    // failed when shouldStop was called with their content
+                    return defer(async () => {
+                        const lastMessage = regularAssistantMessageSchema.parse(
                             state.messages[state.messages.length - 1],
                             {
                                 errorMap: () => ({
                                     message: `Invalid algorithm, the last message in conversation doesn't conform to the expected schema`,
                                 }),
                             }
-                        ),
-                    },
-                ];
+                        );
+
+                        const shouldUseStopMessage =
+                            await shouldStop(lastMessage);
+
+                        return shouldUseStopMessage === true;
+                    }).pipe(
+                        catchError(() => of(false)),
+                        switchMap((shouldUseStopMessage) =>
+                            shouldUseStopMessage ? of(state) : EMPTY
+                        )
+                    );
+                }
+
+                return of(state);
+            }),
+            map((state) => {
+                return {
+                    resultingMessage: regularAssistantMessageSchema.parse(
+                        state.messages[state.messages.length - 1],
+                        {
+                            errorMap: () => ({
+                                message: `Invalid algorithm, the last message in conversation doesn't conform to the expected schema`,
+                            }),
+                        }
+                    ),
+                };
             }),
             toArray()
         );
@@ -395,6 +471,7 @@ export const prompt = makeCachedFunction({
         const choices = ensureHasOneElement(await lastValueFrom(stream));
 
         return {
+            key: ctx.location,
             choices,
         };
     },
diff --git a/packages/refactor-bot/src/refactor/refactor.ts b/packages/refactor-bot/src/refactor/refactor.ts
index 6de06d8..fc41bc4 100644
--- a/packages/refactor-bot/src/refactor/refactor.ts
+++ b/packages/refactor-bot/src/refactor/refactor.ts
@@ -1,3 +1,5 @@
+import { writeFile } from 'fs/promises';
+import { dump } from 'js-yaml';
 import { join } from 'path';
 
 import { createCachedPipeline } from '../cache/state';
@@ -13,19 +15,41 @@ import { randomText } from '../utils/randomText';
 import { checkoutSandbox } from './checkoutSandbox';
 import { enrichObjective } from './enrichObjective';
 import { refactorGoal } from './refactorGoal';
-import { resultsCollector } from './resultsCollector';
+import {
+    collectedRefactorResultSchema,
+    resultsCollector,
+} from './resultsCollector';
 import { retrieveParameters } from './retrieveParameters';
 import { type RefactorConfig } from './types';
 
-const createPipeline = (opts: {
-    location: string;
-    saveToCache: boolean;
+export async function refactor(opts: {
+    config: RefactorConfig;
+    saveToCache?: boolean;
     enableCacheFor?: string[];
-}) =>
-    createCachedPipeline({
-        location: opts.location,
+    disableCacheFor?: string[];
+}) {
+    if (
+        opts.enableCacheFor &&
+        opts.enableCacheFor.filter(Boolean).length === 0
+    ) {
+        throw new Error('enableCacheFor cannot be empty');
+    }
+
+    const root = process.env['CACHE_ROOT'] ?? (await findRepositoryRoot());
+
+    const id = opts.config.id ?? randomText(8);
+
+    const location = join(
+        root,
+        `.refactor-bot/refactors/${opts.config.name}/state/`,
+        id
+    );
+
+    const { execute, abort } = createCachedPipeline({
+        location,
         enableCacheFor: opts.enableCacheFor,
-        saveToCache: opts.saveToCache,
+        saveToCache: opts.saveToCache ?? true,
+        disableCacheFor: opts.disableCacheFor,
         pipeline: async (input: RefactorConfig) => {
             const checkoutResult = await checkoutSandbox(input);
 
@@ -58,46 +82,6 @@ const createPipeline = (opts: {
         },
     });
 
-async function loadRefactorState(opts: {
-    config: RefactorConfig;
-    saveToCache?: boolean;
-    enableCacheFor?: string[];
-}) {
-    const root = await findRepositoryRoot();
-
-    const id = opts.config.id ?? randomText(8);
-
-    const location = join(
-        root,
-        `.refactor-bot/refactors/${opts.config.name}/state/`,
-        id
-    );
-
-    return {
-        ...createPipeline({
-            location,
-            saveToCache: opts.saveToCache ?? true,
-            enableCacheFor: opts.enableCacheFor,
-        }),
-        location,
-        id,
-    };
-}
-
-export async function refactor(opts: {
-    config: RefactorConfig;
-    saveToCache?: boolean;
-    enableCacheFor?: string[];
-}) {
-    if (
-        opts.enableCacheFor &&
-        opts.enableCacheFor.filter(Boolean).length === 0
-    ) {
-        throw new Error('enableCacheFor cannot be empty');
-    }
-
-    const { execute, abort, id } = await loadRefactorState(opts);
-
     logger.info(
         format(
             line`
@@ -152,7 +136,13 @@ export async function refactor(opts: {
         teardown();
     }
 
-    const result = finalizeResults(error);
+    const result = finalizeResults(opts.config, error);
+
+    await writeFile(
+        join(location, 'result.yaml'),
+        dump(collectedRefactorResultSchema.parse(result)),
+        'utf-8'
+    );
 
     const lastCommit = await gitRevParse({
         location: result.sandboxDirectoryPath,
diff --git a/packages/refactor-bot/src/refactor/refactorBatch.ts b/packages/refactor-bot/src/refactor/refactorBatch.ts
index 78aed6a..d30e8cc 100644
--- a/packages/refactor-bot/src/refactor/refactorBatch.ts
+++ b/packages/refactor-bot/src/refactor/refactorBatch.ts
@@ -1,8 +1,7 @@
+import assert from 'assert';
 import { z } from 'zod';
 
-import { AbortError } from '../errors/abortError';
-import { CycleDetectedError } from '../errors/cycleDetectedError';
-import { OutOfContextBoundsError } from '../errors/outOfContextBoundsError';
+import { evaluateFileScore } from '../evaluate/evaluateFileScore';
 import { dispatch } from '../event-bus';
 import { gitResetHard } from '../git/gitResetHard';
 import { gitRevParse } from '../git/gitRevParse';
@@ -11,12 +10,8 @@ import { acceptedEdit } from './actions/acceptedEdit';
 import { discardedEdit } from './actions/discardedEdit';
 import { scriptSchema } from './check';
 import { refactorFile } from './refactorFile';
-import type { RefactorFileResult, RefactorFilesResult } from './types';
-import {
-    pushRefactorFileResults,
-    refactorConfigSchema,
-    refactorResultSchema,
-} from './types';
+import type { RefactorFilesResult } from './types';
+import { refactorConfigSchema } from './types';
 
 export const refactorBatchInputSchema = refactorConfigSchema
     .pick({
@@ -27,21 +22,15 @@ export const refactorBatchInputSchema = refactorConfigSchema
         modelByStepCode: true,
         useMoreExpensiveModelsOnRetry: true,
         allowedFunctions: true,
+        evaluate: true,
+        evaluateMinScore: true,
     })
     .augment({
         objective: z.string(),
+        requirements: z.array(z.string()).nonempty(),
         plannedFiles: z.array(z.string()),
         startCommit: z.string(),
         sandboxDirectoryPath: z.string(),
-        /**
-         * Controls whether to accept the result of a refactor or not.
-         */
-        shouldAcceptResult: z
-            .function(
-                z.tuple([refactorResultSchema]),
-                z.promise(z.boolean()).or(z.boolean())
-            )
-            .optional(),
         scripts: z.array(scriptSchema),
         prettierScriptLocation: z.string().optional(),
     });
@@ -52,12 +41,8 @@ export const refactorBatch = async (
 ) => {
     const { plannedFiles } = await refactorBatchInputSchema.parseAsync(input);
 
-    const shouldAcceptResult =
-        input.shouldAcceptResult ??
-        ((result) => result.status === 'success' && Boolean(result.lastCommit));
-
-    const accepted: RefactorFilesResult['accepted'] = {};
-    const discarded: RefactorFilesResult['discarded'] = {};
+    const accepted: RefactorFilesResult['accepted'] = [];
+    const discarded: RefactorFilesResult['discarded'] = [];
 
     for (const filePath of plannedFiles) {
         const beforeRefactorCommit = await gitRevParse({
@@ -65,36 +50,32 @@ export const refactorBatch = async (
             ref: 'HEAD',
         });
 
-        const { file } = await refactorFile({
+        const refactorFileResult = await refactorFile({
             filePath,
             ...input,
-        }).catch((err) => {
-            if (
-                err instanceof AbortError &&
-                !(err instanceof CycleDetectedError) &&
-                !(err instanceof OutOfContextBoundsError)
-            ) {
-                return Promise.reject(err);
-            }
-
-            /**
-             * @note this is temporary, ideally the refactorFile
-             * function should not throw and return a failure result
-             */
-            return {
-                file: {
-                    status: 'failure',
-                    failureDescription:
-                        err instanceof Error ? err.message : String(err),
-                    filePath,
-                    issues: [],
-                    steps: [],
-                    timestamp: performance.now(),
-                },
-            } as RefactorFileResult;
         });
 
-        const shouldAccept = await Promise.resolve(shouldAcceptResult(file));
+        const { file } = refactorFileResult;
+
+        let shouldAccept =
+            file.status === 'success' && Boolean(file.lastCommit);
+
+        if (shouldAccept && input.evaluate) {
+            const lastCommit = file.lastCommit;
+            assert(lastCommit);
+
+            refactorFileResult.evaluation = await evaluateFileScore({
+                ...input,
+                filePath: file.filePath,
+                commitBeforeChanges: beforeRefactorCommit,
+                commit: lastCommit,
+                choices: 3,
+            });
+
+            shouldAccept =
+                refactorFileResult.evaluation.score >=
+                (input.evaluateMinScore ?? 0.5);
+        }
 
         if (shouldAccept) {
             if (file.lastCommit) {
@@ -112,12 +93,10 @@ export const refactorBatch = async (
                     });
                 }
             }
-            pushRefactorFileResults({
-                into: accepted,
-                result: file,
-            });
 
-            deps.dispatch(acceptedEdit(file));
+            accepted.push(refactorFileResult);
+
+            deps.dispatch(acceptedEdit(refactorFileResult));
         } else {
             const currentCommit = await gitRevParse({
                 location: input.sandboxDirectoryPath,
@@ -136,14 +115,9 @@ export const refactorBatch = async (
                 });
             }
 
-            if (file.status === 'failure') {
-                pushRefactorFileResults({
-                    into: discarded,
-                    result: file,
-                });
-            }
+            discarded.push(refactorFileResult);
 
-            deps.dispatch(discardedEdit(file));
+            deps.dispatch(discardedEdit(refactorFileResult));
         }
     }
 
diff --git a/packages/refactor-bot/src/refactor/refactorFile.ts b/packages/refactor-bot/src/refactor/refactorFile.ts
index 3457bff..20fc14d 100644
--- a/packages/refactor-bot/src/refactor/refactorFile.ts
+++ b/packages/refactor-bot/src/refactor/refactorFile.ts
@@ -7,6 +7,7 @@ import { z } from 'zod';
 import { makeCachedFunction } from '../cache/makeCachedFunction';
 import { AbortError } from '../errors/abortError';
 import { CycleDetectedError } from '../errors/cycleDetectedError';
+import { OutOfContextBoundsError } from '../errors/outOfContextBoundsError';
 import { gitDiffRange } from '../git/gitDiffRange';
 import { gitFilesDiff } from '../git/gitFilesDiff';
 import { gitResetHard } from '../git/gitResetHard';
@@ -19,6 +20,7 @@ import { ensureHasOneElement, hasTwoElements } from '../utils/hasOne';
 import { UnreachableError } from '../utils/UnreachableError';
 import { applyChanges } from './applyChanges';
 import { check, checksSummary, scriptSchema } from './check';
+import { startCollectingLlmUsage } from './collectLlmUsage';
 import { edit } from './edit';
 import { formatCommitMessage } from './prompts/formatCommitMessage';
 import { formatFileContents } from './prompts/formatFileContents';
@@ -164,353 +166,427 @@ export const refactorFile = makeCachedFunction({
     transform: async (input, ctx) => {
         const { filePath, sandboxDirectoryPath } = input;
 
-        const packageManager = await determinePackageManager({
-            directory: input.sandboxDirectoryPath,
+        const refactorFileLlmUsage = startCollectingLlmUsage({
+            key: ctx.location,
         });
 
-        const steps = new Array<{
-            commit: string;
-            task: 'refactor' | 'fix-issues' | 'fix-export-issues';
-            key: string;
-            patch: string;
-            fileContents: string;
-            checkSummary: ReturnType<typeof checksSummary>;
-            timestamp: number;
-        }>();
-
-        const initialCheck = await check(
-            {
-                packageManager,
-                location: input.sandboxDirectoryPath,
-                startCommit: input.startCommit,
-                filePaths: [filePath],
-                scripts: input.scripts,
-            },
-            ctx
-        );
-        if (initialCheck.issues.length > 0) {
-            throw new AbortError(
-                'We should have no errors initially, this should be guaranteed by the initial pre-check made in refactorGoal function. If we got here there must be a bug in the code.'
+        try {
+            const packageManager = await determinePackageManager({
+                directory: input.sandboxDirectoryPath,
+            });
+
+            const steps = new Array<{
+                commit?: string;
+                task: 'refactor' | 'fix-issues' | 'fix-export-issues';
+                key: string;
+                patch?: string;
+                fileContents: string;
+                checkSummary?: ReturnType<typeof checksSummary>;
+                timestamp: number;
+            }>();
+
+            const initialCheck = await check(
+                {
+                    packageManager,
+                    location: input.sandboxDirectoryPath,
+                    startCommit: input.startCommit,
+                    filePaths: [filePath],
+                    scripts: input.scripts,
+                },
+                ctx
             );
-        }
+            if (initialCheck.issues.length > 0) {
+                throw new AbortError(
+                    'We should have no errors initially, this should be guaranteed by the initial pre-check made in refactorGoal function. If we got here there must be a bug in the code.'
+                );
+            }
 
-        const issues: Issue[] = [];
-        const commonEditOpts = {
-            ...input,
-            filePath,
-            sandboxDirectoryPath: input.sandboxDirectoryPath,
-            budgetCents: input.budgetCents,
-            eslintAutoFixScriptArgs: input.scripts.find((script) =>
-                script.args.includes('eslint')
-            )?.args,
-            prettierScriptLocation: input.prettierScriptLocation,
-        };
-
-        const advice: string[] = [];
-
-        const buildResult = () => {
-            return {
+            const issues: Issue[] = [];
+            const commonEditOpts = {
+                ...input,
                 filePath,
-                issues,
-                steps: steps.map((step) => ({
-                    commit: step.commit,
-                    fileContents: step.fileContents,
-                    task: step.task,
-                    timestamp: step.timestamp,
-                })),
-                lastCommit: lastCommit(steps),
-                timestamp: lastTimestamp(steps) ?? performance.now(),
+                sandboxDirectoryPath: input.sandboxDirectoryPath,
+                budgetCents: input.budgetCents,
+                eslintAutoFixScriptArgs: input.scripts.find((script) =>
+                    script.args.includes('eslint')
+                )?.args,
+                prettierScriptLocation: input.prettierScriptLocation,
             };
-        };
 
-        const startCommit = await gitRevParse({
-            location: input.sandboxDirectoryPath,
-            ref: 'HEAD',
-        });
-        const initialFileContents = await readFile(
-            join(input.sandboxDirectoryPath, filePath),
-            'utf-8'
-        );
-        do {
-            try {
-                const fileContents = await readFile(
-                    join(input.sandboxDirectoryPath, filePath),
-                    'utf-8'
-                );
+            const advice: string[] = [];
 
-                const fileDiff = await gitFilesDiff({
-                    location: input.sandboxDirectoryPath,
-                    filePaths: [filePath],
-                    ref: startCommit,
-                });
+            const buildResult = () => {
+                return {
+                    filePath,
+                    issues,
+                    steps: steps.map((step) => ({
+                        commit: step.commit,
+                        fileContents: step.fileContents,
+                        task: step.task,
+                        checkSummary: step.checkSummary,
+                        timestamp: step.timestamp,
+                    })),
+                    lastCommit: lastCommit(steps),
+                    timestamp: lastTimestamp(steps) ?? performance.now(),
+                };
+            };
 
-                const localIssues = issues.filter(
-                    (issue) => issue.filePath === filePath
-                );
+            const startCommit = await gitRevParse({
+                location: input.sandboxDirectoryPath,
+                ref: 'HEAD',
+            });
+            const initialFileContents = await readFile(
+                join(input.sandboxDirectoryPath, filePath),
+                'utf-8'
+            );
+            do {
+                try {
+                    const fileContents = await readFile(
+                        join(input.sandboxDirectoryPath, filePath),
+                        'utf-8'
+                    );
 
-                const externalIssues = issues.filter(
-                    (issue) => issue.filePath !== filePath
-                );
+                    const fileDiff = await gitFilesDiff({
+                        location: input.sandboxDirectoryPath,
+                        filePaths: [filePath],
+                        ref: startCommit,
+                    });
 
-                let task: 'refactor' | 'fix-issues' | 'fix-export-issues' =
-                    'refactor';
-                if (issues.length > 0) {
-                    if (localIssues.length > 0) {
-                        task = 'fix-issues';
-                    } else {
-                        task = 'fix-export-issues';
+                    const localIssues = issues.filter(
+                        (issue) => issue.filePath === filePath
+                    );
+
+                    const externalIssues = issues.filter(
+                        (issue) => issue.filePath !== filePath
+                    );
+
+                    let task: 'refactor' | 'fix-issues' | 'fix-export-issues' =
+                        'refactor';
+                    if (issues.length > 0) {
+                        if (localIssues.length > 0) {
+                            task = 'fix-issues';
+                        } else {
+                            task = 'fix-export-issues';
+                        }
                     }
-                }
 
-                let objective: string;
-                switch (task) {
-                    case 'refactor':
-                        objective = performFileRefactoringPromptText({
-                            fileContents,
-                            filePath,
-                            language: 'TypeScript',
-                            objective: input.objective,
-                        });
-                        break;
-                    case 'fix-issues':
-                        objective = fixLocalIssuesPromptText({
-                            fileContents,
-                            filePath,
-                            language: 'TypeScript',
-                            objective: input.objective,
-                            issues: localIssues.map(({ issue }) => issue),
-                            fileDiff,
-                            advice,
-                        });
-                        break;
-                    case 'fix-export-issues':
-                        objective = fixExportIssuesPromptText({
-                            fileContents,
-                            filePath,
-                            language: 'TypeScript',
-                            objective: input.objective,
-                            issues: externalIssues.map(({ issue }) => issue),
-                            fileDiff,
-                            advice,
-                            ...(await changeInfo({
-                                location: input.sandboxDirectoryPath,
-                                oldFileContents: initialFileContents,
-                                newFileContents: fileContents,
+                    let objective: string;
+                    switch (task) {
+                        case 'refactor':
+                            objective = performFileRefactoringPromptText({
+                                fileContents,
                                 filePath,
+                                language: 'TypeScript',
+                                objective: input.objective,
+                            });
+                            break;
+                        case 'fix-issues':
+                            objective = fixLocalIssuesPromptText({
+                                fileContents,
+                                filePath,
+                                language: 'TypeScript',
+                                objective: input.objective,
+                                issues: localIssues.map(({ issue }) => issue),
                                 fileDiff,
-                            })),
-                        });
-                        break;
-                    default:
-                        throw new UnreachableError(task);
-                }
-
-                const { choices } = await edit(
-                    {
-                        ...commonEditOpts,
-                        objective,
-                        fileContents,
-                    },
-                    ctx
-                );
-
-                const editResult = choices[0];
-
-                if (editResult.status !== 'success') {
-                    if (issues.length > 0) {
-                        const file = {
-                            status: 'failure' as const,
-                            failureDescription:
-                                'The model has failed to fix the issues in the file',
-                            ...buildResult(),
-                        };
-
-                        return {
-                            file,
-                        };
+                                advice,
+                            });
+                            break;
+                        case 'fix-export-issues':
+                            objective = fixExportIssuesPromptText({
+                                fileContents,
+                                filePath,
+                                language: 'TypeScript',
+                                objective: input.objective,
+                                issues: externalIssues.map(
+                                    ({ issue }) => issue
+                                ),
+                                fileDiff,
+                                advice,
+                                ...(await changeInfo({
+                                    location: input.sandboxDirectoryPath,
+                                    oldFileContents: initialFileContents,
+                                    newFileContents: fileContents,
+                                    filePath,
+                                    fileDiff,
+                                })),
+                            });
+                            break;
+                        default:
+                            throw new UnreachableError(task);
                     }
-                    continue;
-                }
 
-                const commitMessage =
-                    issues.length === 0
-                        ? formatCommitMessage({
-                              type: 'refactor',
-                              filePath,
-                              description: input.objective,
-                          })
-                        : formatCommitMessage({
-                              type: 'fix',
-                              filePath,
-                              description: issues
-                                  .map((issue) => issue.issue)
-                                  .join('\n'),
-                          });
-
-                const { commit } = await applyChanges({
-                    commitMessage,
-                    filePath,
-                    sandboxDirectoryPath,
-                    fileContents: editResult.fileContents,
-                    fileContentsHash: editResult.fileContentsHash,
-                });
-
-                const checkResult = await check(
-                    {
-                        packageManager,
-                        location: input.sandboxDirectoryPath,
-                        startCommit: input.startCommit,
-                        scripts: input.scripts,
-                    },
-                    ctx
-                );
+                    const { choices } = await edit(
+                        {
+                            ...commonEditOpts,
+                            objective,
+                            fileContents,
+                        },
+                        ctx
+                    );
 
-                const checkSummary = checksSummary({
-                    issues,
-                    checkResult,
-                    checkCommit: commit,
-                });
-
-                issues.splice(
-                    0,
-                    issues.length,
-                    ...checkSummary.newIssues,
-                    ...checkSummary.remainingIssues
-                );
+                    const editResult = choices[0];
+
+                    if (editResult.status !== 'success') {
+                        if (issues.length > 0) {
+                            const file = {
+                                status: 'failure' as const,
+                                failureDescription:
+                                    'The model has failed to fix the issues in the file',
+                                ...buildResult(),
+                            };
+
+                            return {
+                                ...(ctx.location && {
+                                    key: ctx.location,
+                                }),
+                                file,
+                                usage: refactorFileLlmUsage.finishCollecting(),
+                            };
+                        } else {
+                            const key = editResult.key;
+                            assert(key);
+                            steps.push({
+                                task,
+                                key,
+                                fileContents,
+                                timestamp: performance.now(),
+                            });
+                        }
+                        continue;
+                    }
 
-                const key = editResult.key;
-                assert(key);
+                    const commitMessage =
+                        issues.length === 0
+                            ? formatCommitMessage({
+                                  type: 'refactor',
+                                  filePath,
+                                  description: input.objective,
+                              })
+                            : formatCommitMessage({
+                                  type: 'fix',
+                                  filePath,
+                                  description: issues
+                                      .map((issue) => issue.issue)
+                                      .join('\n'),
+                              });
+
+                    const { commit } = await applyChanges({
+                        commitMessage,
+                        filePath,
+                        sandboxDirectoryPath,
+                        fileContents: editResult.fileContents,
+                        fileContentsHash: editResult.fileContentsHash,
+                    });
+
+                    const checkResult = await check(
+                        {
+                            packageManager,
+                            location: input.sandboxDirectoryPath,
+                            startCommit: input.startCommit,
+                            scripts: input.scripts,
+                        },
+                        ctx
+                    );
 
-                steps.push({
-                    task,
-                    key,
-                    patch: await gitFilesDiff({
-                        location: input.sandboxDirectoryPath,
-                        filePaths: [filePath],
-                        ref: lastCommit(steps) || startCommit,
-                    }),
-                    fileContents: editResult.fileContents,
-                    commit,
-                    checkSummary,
-                    timestamp: performance.now(),
-                });
-            } catch (exc) {
-                if (exc instanceof CycleDetectedError) {
-                    const error = exc;
-                    const index = steps.findIndex(
-                        (step) => step.key === error.key
+                    const checkSummary = checksSummary({
+                        issues,
+                        checkResult,
+                        checkCommit: commit,
+                    });
+
+                    issues.splice(
+                        0,
+                        issues.length,
+                        ...checkSummary.newIssues,
+                        ...checkSummary.remainingIssues
                     );
-                    const stepsLeadingToCycle = steps.slice(index);
-                    const leastProblematicStep = ensureHasOneElement(
-                        orderBy(
-                            stepsLeadingToCycle,
-                            (step) => step.checkSummary.totalNumberOfIssues,
-                            'asc'
-                        )
-                    )[0];
-                    const leastProblematicStepIndex =
-                        stepsLeadingToCycle.findIndex(
-                            (value) => value === leastProblematicStep
-                        );
 
-                    /**
-                     * @todo haven't seen more complex cycles at this
-                     * moment, but it could be possible, so let's double
-                     * check that we are hitting the expected case
-                     */
-                    if (hasTwoElements(stepsLeadingToCycle)) {
-                        const [startRef, endRef] =
-                            leastProblematicStepIndex === 0
-                                ? [
-                                      leastProblematicStep.commit,
-                                      stepsLeadingToCycle[1].commit,
-                                  ]
-                                : [
-                                      leastProblematicStep.commit,
-                                      stepsLeadingToCycle[0].commit,
-                                  ];
-                        const mostIssues = stepsLeadingToCycle.reduce(
-                            (a, b) =>
-                                Math.max(a, b.checkSummary.totalNumberOfIssues),
-                            0
-                        );
-                        const patch = await gitDiffRange({
+                    const key = editResult.key;
+                    assert(key);
+
+                    steps.push({
+                        task,
+                        key,
+                        patch: await gitFilesDiff({
                             location: input.sandboxDirectoryPath,
                             filePaths: [filePath],
-                            startRef,
-                            endRef,
-                        });
-                        //
-                        advice.push(
-                            formatFileDiff({
-                                fileDiff: patch,
-                                filePath,
-                                headline: `Do not make changes represented by the following diff as that would cause ${mostIssues} lint and compilation issues:`,
-                            })
-                        );
-                        await gitResetHard({
-                            location: input.sandboxDirectoryPath,
-                            ref: leastProblematicStep.commit,
-                        });
-                        const leastProblematicStepGlobalIndex = steps.findIndex(
-                            (value) => value === leastProblematicStep
-                        );
-                        steps.splice(
-                            leastProblematicStepGlobalIndex + 1,
-                            steps.length - leastProblematicStepGlobalIndex + 1
+                            ref: lastCommit(steps) || startCommit,
+                        }),
+                        fileContents: editResult.fileContents,
+                        commit,
+                        checkSummary,
+                        timestamp: performance.now(),
+                    });
+                } catch (exc) {
+                    if (exc instanceof CycleDetectedError) {
+                        const error = exc;
+                        const index = steps.findIndex(
+                            (step) => step.key === error.key
                         );
-                        issues.splice(
-                            0,
-                            issues.length,
-                            ...leastProblematicStep.checkSummary.newIssues,
-                            ...leastProblematicStep.checkSummary.remainingIssues
-                        );
-
-                        logger.debug('Handling cycle', {
-                            leastProblematicStepIndex,
-                            leastProblematicStepCommit:
-                                leastProblematicStep.commit,
-                            key: error.key,
-                            advice,
-                            steps: steps.map((step) => ({
-                                task: step.task,
-                                key: basename(step.key),
-                                commit: step.commit,
-                                totalNumberOfIssues:
-                                    step.checkSummary.totalNumberOfIssues,
-                            })),
-                        });
+                        const stepsLeadingToCycle = steps
+                            .slice(index)
+                            .filter((step) => step.commit !== undefined);
+                        const leastProblematicStep = ensureHasOneElement(
+                            orderBy(
+                                stepsLeadingToCycle,
+                                (step) =>
+                                    step.checkSummary?.totalNumberOfIssues ?? 0,
+                                'asc'
+                            )
+                        )[0];
+                        const leastProblematicStepIndex =
+                            stepsLeadingToCycle.findIndex(
+                                (value) => value === leastProblematicStep
+                            );
+
+                        /**
+                         * @todo haven't seen more complex cycles at this
+                         * moment, but it could be possible, so let's double
+                         * check that we are hitting the expected case
+                         */
+                        if (hasTwoElements(stepsLeadingToCycle)) {
+                            const [startRef, endRef] =
+                                leastProblematicStepIndex === 0
+                                    ? [
+                                          leastProblematicStep.commit,
+                                          stepsLeadingToCycle[1].commit,
+                                      ]
+                                    : [
+                                          leastProblematicStep.commit,
+                                          stepsLeadingToCycle[0].commit,
+                                      ];
+                            const mostIssues = stepsLeadingToCycle.reduce(
+                                (a, b) =>
+                                    Math.max(
+                                        a,
+                                        b.checkSummary?.totalNumberOfIssues ?? 0
+                                    ),
+                                0
+                            );
+
+                            // we check for commit !== undefined above
+                            assert(startRef);
+                            assert(endRef);
+                            assert(leastProblematicStep.commit);
+
+                            const patch = await gitDiffRange({
+                                location: input.sandboxDirectoryPath,
+                                filePaths: [filePath],
+                                startRef,
+                                endRef,
+                            });
+                            //
+                            advice.push(
+                                formatFileDiff({
+                                    fileDiff: patch,
+                                    filePath,
+                                    headline: `Do not make changes represented by the following diff as that would cause ${mostIssues} lint and compilation issues:`,
+                                })
+                            );
+                            await gitResetHard({
+                                location: input.sandboxDirectoryPath,
+                                ref: leastProblematicStep.commit,
+                            });
+                            const leastProblematicStepGlobalIndex =
+                                steps.findIndex(
+                                    (value) => value === leastProblematicStep
+                                );
+                            steps.splice(
+                                leastProblematicStepGlobalIndex + 1,
+                                steps.length -
+                                    leastProblematicStepGlobalIndex +
+                                    1
+                            );
+                            issues.splice(
+                                0,
+                                issues.length,
+                                ...(leastProblematicStep.checkSummary
+                                    ?.newIssues || []),
+                                ...(leastProblematicStep.checkSummary
+                                    ?.remainingIssues || [])
+                            );
+
+                            logger.debug('Handling cycle', {
+                                leastProblematicStepIndex,
+                                leastProblematicStepCommit:
+                                    leastProblematicStep.commit,
+                                key: error.key,
+                                advice,
+                                steps: steps.map((step) => ({
+                                    task: step.task,
+                                    key: basename(step.key),
+                                    commit: step.commit,
+                                    totalNumberOfIssues:
+                                        step.checkSummary
+                                            ?.totalNumberOfIssues ?? 0,
+                                })),
+                            });
+                        } else {
+                            logger.log(stepsLeadingToCycle);
+                            logger.log({
+                                leastProblematicStepIndex,
+                                leastProblematicStepCommit:
+                                    leastProblematicStep.commit,
+                                key: error.key,
+                                advice,
+                                steps: steps.map((step) => ({
+                                    task: step.task,
+                                    key: basename(step.key),
+                                    commit: step.commit,
+                                    totalNumberOfIssues:
+                                        step.checkSummary
+                                            ?.totalNumberOfIssues ?? 0,
+                                })),
+                            });
+                            throw exc;
+                        }
                     } else {
-                        logger.log(stepsLeadingToCycle);
-                        logger.log({
-                            leastProblematicStepIndex,
-                            leastProblematicStepCommit:
-                                leastProblematicStep.commit,
-                            key: error.key,
-                            advice,
-                            steps: steps.map((step) => ({
-                                task: step.task,
-                                key: basename(step.key),
-                                commit: step.commit,
-                                totalNumberOfIssues:
-                                    step.checkSummary.totalNumberOfIssues,
-                            })),
-                        });
                         throw exc;
                     }
-                } else {
-                    throw exc;
                 }
-            }
-        } while (issues.length > 0);
+            } while (issues.length > 0);
 
-        const file = {
-            status: 'success' as const,
-            ...buildResult(),
-        };
+            const file = {
+                status: 'success' as const,
+                ...buildResult(),
+            };
+
+            return {
+                ...(ctx.location && {
+                    key: ctx.location,
+                }),
+                file,
+                usage: refactorFileLlmUsage.finishCollecting(),
+            };
+        } catch (err) {
+            /**
+             * If there was any error - do not save to the cache
+             */
+            ctx.skipSavingToCache(err);
+
+            if (
+                err instanceof AbortError &&
+                !(err instanceof CycleDetectedError) &&
+                !(err instanceof OutOfContextBoundsError)
+            ) {
+                return Promise.reject(err);
+            }
 
-        return {
-            file,
-        };
+            return {
+                ...(ctx.location && {
+                    key: ctx.location,
+                }),
+                file: {
+                    status: 'failure' as const,
+                    failureDescription:
+                        err instanceof Error ? err.message : String(err),
+                    filePath: input.filePath,
+                    issues: [],
+                    steps: [],
+                    timestamp: performance.now(),
+                },
+                usage: refactorFileLlmUsage.finishCollecting(),
+            };
+        }
     },
 });
diff --git a/packages/refactor-bot/src/refactor/refactorGoal.ts b/packages/refactor-bot/src/refactor/refactorGoal.ts
index e89bc62..9a72f74 100644
--- a/packages/refactor-bot/src/refactor/refactorGoal.ts
+++ b/packages/refactor-bot/src/refactor/refactorGoal.ts
@@ -17,6 +17,7 @@ import { refactorConfigSchema } from './types';
 
 export const refactorGoalInputSchema = refactorConfigSchema.augment({
     objective: z.string(),
+    requirements: z.array(z.string()).nonempty(),
     sandboxDirectoryPath: z.string(),
     startCommit: z.string(),
     filesToEdit: z.array(z.string()),
diff --git a/packages/refactor-bot/src/refactor/resetToLastAcceptedCommit.ts b/packages/refactor-bot/src/refactor/resetToLastAcceptedCommit.ts
index 5e9fbbe..86885d9 100644
--- a/packages/refactor-bot/src/refactor/resetToLastAcceptedCommit.ts
+++ b/packages/refactor-bot/src/refactor/resetToLastAcceptedCommit.ts
@@ -11,10 +11,8 @@ export async function resetToLastAcceptedCommit(opts: {
     result: RefactorFilesResult;
 }) {
     const accepted = orderBy(
-        Object.entries(opts.result.accepted).flatMap(([_, entry]) =>
-            entry.filter((file) => file.lastCommit).map((file) => file)
-        ),
-        ['timestamp'],
+        opts.result.accepted.filter((entry) => entry.file.lastCommit),
+        ['file.timestamp'],
         ['desc']
     );
 
@@ -29,7 +27,8 @@ export async function resetToLastAcceptedCommit(opts: {
         location: opts.location,
         ref: 'HEAD',
     });
-    const lastCommit = accepted[0].lastCommit;
+
+    const lastCommit = accepted[0].file.lastCommit;
 
     if (lastCommit && lastCommit !== currentCommit) {
         logger.info(`Resetting to last successful commit ${lastCommit}`);
diff --git a/packages/refactor-bot/src/refactor/resultsCollector.ts b/packages/refactor-bot/src/refactor/resultsCollector.ts
index b73acea..ad52fb6 100644
--- a/packages/refactor-bot/src/refactor/resultsCollector.ts
+++ b/packages/refactor-bot/src/refactor/resultsCollector.ts
@@ -1,21 +1,73 @@
-import type { z } from 'zod';
+import { z } from 'zod';
 
-import { executionTiming } from '../cache/actions/executionStatus';
+import {
+    executionStarted,
+    executionTiming,
+} from '../cache/actions/executionStatus';
 import { explainCacheKey } from '../cache/cache';
-import { calculatePrice } from '../chat-gpt/api';
 import { actions } from '../event-bus';
 import { ofTypes } from '../event-bus/operators';
+import { extractErrorInfo } from '../logger/extractErrorInfo';
+import { logger } from '../logger/logger';
+import { line } from '../text/line';
 import { ensureHasOneElement, hasOneElement } from '../utils/hasOne';
 import { UnreachableError } from '../utils/UnreachableError';
 import { acceptedEdit } from './actions/acceptedEdit';
 import { checkoutSandboxCompleted } from './actions/checkoutSandboxCompleted';
 import { discardedEdit } from './actions/discardedEdit';
-import { gptRequestSuccess } from './actions/gptRequestSuccess';
 import { planFilesCompleted } from './actions/planFilesCompleted';
-import type { checkoutSandboxResultSchema } from './checkoutSandbox';
-import type { planFilesResultSchema } from './planFiles';
-import type { RefactorFilesResult } from './types';
-import { mutateToMergeRefactorRecords } from './types';
+import { checkoutSandboxResultSchema } from './checkoutSandbox';
+import { startCollectingLlmUsage } from './collectLlmUsage';
+import { planFilesResultSchema } from './planFiles';
+import type { RefactorConfig, refactorFileResultSchema } from './types';
+import { llmUsageEntrySchema, refactorFilesResultSchema } from './types';
+
+export const collectedRefactorResultSchema = z
+    .object({
+        objective: z.string(),
+        status: z.enum(['success', 'failure']),
+        error: z.record(z.unknown()).optional(),
+    })
+    .merge(checkoutSandboxResultSchema)
+    .merge(refactorFilesResultSchema)
+    .augment({
+        planFilesResults: z.array(planFilesResultSchema),
+        usage: z.array(llmUsageEntrySchema),
+        performance: z.object({
+            totalDurationMs: z.number(),
+            durationMsByStep: z.record(
+                z.object({
+                    durationMs: z.number(),
+                })
+            ),
+        }),
+    });
+
+const commitHashRegex = /^[a-f0-9]{40}$/;
+
+function scanForCommitHashes(data: unknown, ignore: string[]): string[] {
+    if (typeof data === 'object' && data !== null) {
+        const entries = Object.entries(data);
+        for (const [key, value] of entries) {
+            if (ignore.includes(key)) {
+                continue;
+            }
+
+            const result = scanForCommitHashes(value, ignore);
+            if (result.length > 0) {
+                return result;
+            }
+        }
+    }
+
+    if (typeof data === 'string') {
+        if (commitHashRegex.test(data) && !ignore.includes(data)) {
+            return [data];
+        }
+    }
+
+    return [];
+}
 
 /**
  * Results collector allows us to collect all the results of a refactor
@@ -33,29 +85,17 @@ import { mutateToMergeRefactorRecords } from './types';
 export function resultsCollector(deps = { actions }) {
     const timestamp = performance.now();
 
-    const files: RefactorFilesResult = {
-        accepted: {},
-        discarded: {},
-    };
-
     const checkoutResults: z.infer<typeof checkoutSandboxResultSchema>[] = [];
 
     const planFilesResults: z.infer<typeof planFilesResultSchema>[] = [];
 
-    const costsByStep: {
-        total: ReturnType<typeof calculatePrice>;
-        unknown: ReturnType<typeof calculatePrice>;
-    } & Record<string, ReturnType<typeof calculatePrice>> = {
-        total: {
-            completionPrice: 0,
-            promptPrice: 0,
-            totalPrice: 0,
-        },
-        unknown: {
-            completionPrice: 0,
-            promptPrice: 0,
-            totalPrice: 0,
-        },
+    const accepted: z.infer<typeof refactorFileResultSchema>[] = [];
+
+    const discarded: z.infer<typeof refactorFileResultSchema>[] = [];
+
+    const files = {
+        accepted,
+        discarded,
     };
 
     type DurationSample = {
@@ -66,6 +106,8 @@ export function resultsCollector(deps = { actions }) {
 
     const samples: DurationSample[] = [];
 
+    const usage = startCollectingLlmUsage();
+
     const subscription = deps
         .actions()
         .pipe(
@@ -74,7 +116,7 @@ export function resultsCollector(deps = { actions }) {
                 acceptedEdit,
                 discardedEdit,
                 planFilesCompleted,
-                gptRequestSuccess,
+                executionStarted,
                 executionTiming
             )
         )
@@ -85,53 +127,40 @@ export function resultsCollector(deps = { actions }) {
                         checkoutResults.push(event.data);
                         break;
                     case 'acceptedEdit':
-                        mutateToMergeRefactorRecords({
-                            from: {
-                                [event.data.filePath]: [event.data],
-                            },
-                            into: files.accepted,
-                        });
+                        accepted.push(event.data);
                         break;
                     case 'discardedEdit':
-                        mutateToMergeRefactorRecords({
-                            from: {
-                                [event.data.filePath]: [event.data],
-                            },
-                            into: files.discarded,
-                        });
+                        discarded.push(event.data);
                         break;
                     case 'planFilesCompleted':
                         planFilesResults.push(event.data);
                         break;
-                    case 'gptRequestSuccess':
+                    case 'executionStarted':
                         {
-                            const price = calculatePrice({
-                                ...event.data.response,
-                                model: event.data.model,
-                            });
+                            if (event.data.name === 'checkout-sandbox') {
+                                return;
+                            }
 
-                            const steps = explainCacheKey(event.data.key) ?? [
-                                { name: 'unknown', hash: 'unknown' },
-                            ];
+                            const hashes = scanForCommitHashes(
+                                event.data,
+                                checkoutResults
+                                    .map((r) => r.startCommit)
+                                    .concat(['diffHash', 'filesDiffHash'])
+                            );
 
-                            steps.push({ name: 'total', hash: 'unknown' });
-                            steps.forEach(({ name }) => {
-                                const current = costsByStep[name] ?? {
-                                    completionPrice: 0,
-                                    promptPrice: 0,
-                                    totalPrice: 0,
-                                };
-
-                                costsByStep[name] = {
-                                    completionPrice:
-                                        current.completionPrice +
-                                        price.completionPrice,
-                                    promptPrice:
-                                        current.promptPrice + price.promptPrice,
-                                    totalPrice:
-                                        current.totalPrice + price.totalPrice,
-                                };
-                            });
+                            if (hashes.length > 0) {
+                                logger.warn(
+                                    line`
+                                        Found commit hash in the input of a
+                                        cached function "${event.data.name}":
+                                        "${hashes.join(', ')}", this is likely
+                                        a mistake, because commits made at a
+                                        different time will result in different
+                                        hashes, which will invalidate the cache
+                                        and cause the function to run again.
+                                    `
+                                );
+                            }
                         }
                         break;
                     case 'executionTiming':
@@ -161,72 +190,78 @@ export function resultsCollector(deps = { actions }) {
         });
 
     const finalizePerformance = () => {
-        const totalDuration = performance.now() - timestamp;
+        const totalDurationMs = performance.now() - timestamp;
 
         type DurationSummary = {
-            duration: number;
+            durationMs: number;
         };
 
-        const durationByStep = samples.reduce<
+        const durationMsByStep = samples.reduce<
             { total: DurationSummary } & Record<string, DurationSummary>
         >(
             (acc, { duration, name, level }) => {
                 const existing = acc[name] ?? {
-                    duration: 0,
+                    durationMs: 0,
                 };
 
                 if (level === 1) {
-                    acc.total.duration += duration;
+                    acc.total.durationMs += duration;
                 }
 
                 return {
                     ...acc,
                     [name]: {
-                        duration: duration + existing.duration,
+                        durationMs: duration + existing.durationMs,
                     },
                 };
             },
             {
                 total: {
-                    duration: 0,
+                    durationMs: 0,
                 },
                 unknown: {
-                    duration: 0,
+                    durationMs: 0,
                 },
             }
         );
 
         return {
-            totalDuration,
-            durationByStep,
+            totalDurationMs,
+            durationMsByStep,
         };
     };
 
     return {
         teardown: () => {
             subscription.unsubscribe();
+            usage.finishCollecting();
         },
-        finalizeResults: (error?: Error) => {
+        finalizeResults: (
+            config: RefactorConfig,
+            error?: Error
+        ): z.output<typeof collectedRefactorResultSchema> => {
             if (error) {
                 if (!hasOneElement(checkoutResults)) {
                     throw error;
                 }
                 return {
                     status: 'failure' as const,
-                    error,
+                    error: extractErrorInfo(error),
+                    objective: config.objective,
                     ...ensureHasOneElement(checkoutResults)[0],
                     ...files,
                     planFilesResults,
-                    costsByStep,
+                    usage: usage.getUsage(),
                     performance: finalizePerformance(),
                 };
             } else {
                 return {
                     status: 'success' as const,
+                    objective: config.objective,
                     ...ensureHasOneElement(checkoutResults)[0],
                     ...files,
                     planFilesResults,
-                    costsByStep,
+                    usage: usage.getUsage(),
                     performance: finalizePerformance(),
                 };
             }
diff --git a/packages/refactor-bot/src/refactor/retrieveParameters.ts b/packages/refactor-bot/src/refactor/retrieveParameters.ts
index 8cd7e58..118f57f 100644
--- a/packages/refactor-bot/src/refactor/retrieveParameters.ts
+++ b/packages/refactor-bot/src/refactor/retrieveParameters.ts
@@ -1,123 +1,50 @@
 import { z } from 'zod';
 
-import { makeCachedFunction } from '../cache/makeCachedFunction';
-import { markdown } from '../markdown/markdown';
-import { format } from '../text/format';
-import { determineModelParameters } from './determineModelParameters';
-import { validateAndParseListOfFiles } from './parsers/validateAndParseListOfFiles';
-import { prompt } from './prompt';
-import { refactorConfigSchema } from './types';
-
-export const retrieveParametersInputSchema = refactorConfigSchema
-    .pick({
-        objective: true,
-        budgetCents: true,
-        model: true,
-        modelByStepCode: true,
-        useMoreExpensiveModelsOnRetry: true,
-        scope: true,
-        tsConfigJsonFileName: true,
-        allowedFunctions: true,
-    })
-    .augment({
-        sandboxDirectoryPath: z.string(),
+import type { CacheStateRef } from '../cache/types';
+import { extractRequirements } from '../evaluate/extractRequirements';
+import {
+    determineFilesToEdit,
+    determineFilesToEditResultSchema,
+} from './filesToEdit';
+import { refactorConfigPromptOptsSchema } from './prompt';
+
+export const retrieveParametersInputSchema =
+    refactorConfigPromptOptsSchema.augment({
+        objective: z.string(),
+        filesToEdit: z.array(z.string()).nonempty().optional(),
     });
 
-export const retrieveParametersResultSchema = z.object({
-    /**
-     * List of files allowed to be edited or refactored when mentioned in the
-     * objective, in the same order as mentioned in the objective.
-     *
-     * If the objective does not mention any files, this field is empty.
-     */
-    filesToEdit: z.array(z.string()),
-});
-
-export type RetrieveParametersResponse = z.infer<
-    typeof retrieveParametersResultSchema
->;
-
-const systemPrompt = markdown`
-    Think step by step. Be concise and to the point. Do not make assumptions
-    other than what was given in the instructions.
-`;
-
-const listFilesToEditPromptText = (objective: string) =>
-    format(
-        markdown`
-            %objective%
-
-            Given the above objective that implies modifications to source code,
-            follow the steps below to determine the list of files the user wants
-            us to edit or refactor.
-
-            1. Check the objective for mentions of files to be changed, edited
-               or refactored. Could be a single file or multiple files.
-
-            2. If there is no mention of files respond "No mention of files to
-               be edited or refactored".
-
-            3. If the resulting list from step #1 is not empty, format it -
-               return one file path per line in your response. File paths should
-               be surrounded by a backtick. File paths should be relative to
-               repository root. The result must be a numbered list in the
-               format:
-
-            #. \`path/to/file.ts\`
-
-            #. \`path/to/another/file.ts\`
+export const retrieveParametersResultSchema =
+    determineFilesToEditResultSchema.augment({
+        requirements: z.array(z.string()).nonempty(),
+    });
 
-            The number of each entry must be followed by a period. Do not prefix
-            the list of files with any text. Do not follow the list of files
-            with any other text. If the objective asks to edit in a specific
-            order, retain the order.
-        `,
+export const retrieveParameters = async (
+    input: z.input<typeof retrieveParametersInputSchema>,
+    ctx?: CacheStateRef
+) => {
+    const { filesToEdit } = input.filesToEdit
+        ? {
+              filesToEdit: input.filesToEdit,
+          }
+        : await determineFilesToEdit(input, ctx);
+
+    const { choices } = await extractRequirements(
         {
-            objective,
-        }
+            ...input,
+            choices: 2,
+        },
+        ctx
     );
 
-export const retrieveParameters = makeCachedFunction({
-    name: 'retrieve-parameters',
-    inputSchema: retrieveParametersInputSchema,
-    resultSchema: retrieveParametersResultSchema,
-    transform: async (input, ctx) => {
-        const allowedFilesResult = await prompt(
-            {
-                preface: systemPrompt,
-                prompt: listFilesToEditPromptText(input.objective),
-                temperature: 0.2,
-                budgetCents: input.budgetCents,
-                functionsConfig: {
-                    repositoryRoot: input.sandboxDirectoryPath,
-                    scope: input.scope,
-                    tsConfigJsonFileName: input.tsConfigJsonFileName,
-                    /**
-                     * @note we do not want to allow any functions for this prompt
-                     */
-                    allowedFunctions: [],
-                },
-                ...determineModelParameters(input, ctx),
-                shouldStop: async (message) => {
-                    await validateAndParseListOfFiles({
-                        sandboxDirectoryPath: input.sandboxDirectoryPath,
-                        text: message.content,
-                        sortBySize: false,
-                    });
-                    return true as const;
-                },
-            },
-            ctx
-        );
-
-        const filesToEdit = await validateAndParseListOfFiles({
-            sandboxDirectoryPath: input.sandboxDirectoryPath,
-            text: allowedFilesResult.choices[0].resultingMessage.content,
-            sortBySize: false,
-        });
+    const requirements = choices.reduce(
+        (acc, choice) =>
+            choice.requirements.length > acc.length ? acc : choice.requirements,
+        choices[0].requirements
+    );
 
-        return {
-            filesToEdit,
-        };
-    },
-});
+    return {
+        filesToEdit,
+        requirements,
+    };
+};
diff --git a/packages/refactor-bot/src/refactor/runRefactor.ts b/packages/refactor-bot/src/refactor/runRefactor.ts
index d522206..74ff7aa 100644
--- a/packages/refactor-bot/src/refactor/runRefactor.ts
+++ b/packages/refactor-bot/src/refactor/runRefactor.ts
@@ -12,12 +12,14 @@ import { formatObject } from '../logger/formatObject';
 import { glowFormat } from '../markdown/glowFormat';
 import { markdown } from '../markdown/markdown';
 import { goToEndOfFile } from '../prompt/editor';
+import { formatFencedCodeBlock } from '../prompt-formatters/formatFencedCodeBlock';
+import { formatOptional } from '../prompt-formatters/formatOptional';
 import { format } from '../text/format';
-import { line } from '../text/line';
 import { hasOneElement } from '../utils/hasOne';
+import { summarizeLlmUsagePrice } from './collectLlmUsage';
 import { loadRefactorConfigs } from './loadRefactors';
 import { refactor } from './refactor';
-import type { RefactorConfig } from './types';
+import { type RefactorConfig, summarizeRefactorFilesResult } from './types';
 
 const validateUsingSchema =
     <T extends z.ZodType>(schema: T) =>
@@ -128,6 +130,15 @@ async function determineConfig(opts: {
             );
         }
 
+        if (opts.id) {
+            return {
+                config: {
+                    ...config,
+                    id: opts.id,
+                },
+            };
+        }
+
         return {
             config,
         };
@@ -177,14 +188,19 @@ const currentRepositoryRefactoringReport = async (
         successBranch: string;
     }
 ) => {
-    const { accepted, discarded, successBranch, sandboxDirectoryPath } = opts;
+    const { successBranch, sandboxDirectoryPath } = opts;
 
-    const perFile = Object.entries(accepted)
+    const { accepted, discarded } = summarizeRefactorFilesResult(opts);
+
+    const perFile = Object.entries(accepted.resultsByFilePaths)
         .flatMap(([file, results]) => {
-            const firstCommit = results[0]?.steps[0]?.commit?.substring(0, 7);
+            const firstCommit = results[0]?.file.steps[0]?.commit?.substring(
+                0,
+                7
+            );
             const lastCommit = results[
                 results.length - 1
-            ]?.lastCommit?.substring(0, 7);
+            ]?.file.lastCommit?.substring(0, 7);
             if (!firstCommit || !lastCommit) {
                 return [];
             }
@@ -194,22 +210,25 @@ const currentRepositoryRefactoringReport = async (
         })
         .join('\n');
 
-    const firstCommits = Object.entries(discarded)
+    const firstCommits = Object.entries(discarded.resultsByFilePaths)
         .flatMap(([file, results]) => {
-            const firstCommit = results[0]?.steps[0]?.commit?.substring(0, 7);
+            const firstCommit = results[0]?.file.steps[0]?.commit?.substring(
+                0,
+                7
+            );
             if (!firstCommit) {
-                return [`# ${file}\n# No commits found for this file`];
+                return [];
             }
             return [`# ${file}\ngit cherry-pick -n ${firstCommit}`];
         })
         .join('\n');
 
     const failedToRefactor =
-        Object.keys(discarded).length > 0
+        Object.keys(discarded.resultsByFilePaths).length > 0
             ? markdown`
-                Failed to refactor:
+                Attempted to refactor:
 
-                ${Object.keys(discarded)
+                ${Object.keys(discarded.resultsByFilePaths)
                     .map((file, i) => `${i + 1}. \`${file}\``)
                     .join('\n')}
             `
@@ -219,9 +238,9 @@ const currentRepositoryRefactoringReport = async (
     if (Object.keys(discarded).length > 0 && firstCommits) {
         firstCommitsOfFailures = format(
             markdown`
-                ## First commits of failed files
+                ## Change attempts that failed
 
-                These are least invasive commits focused on the goal which
+                These should be least invasive commits focused on the goal which
                 didn't pass checks. You can try to fix them manually.
 
                 ~~~sh
@@ -232,7 +251,7 @@ const currentRepositoryRefactoringReport = async (
         );
     }
 
-    const successfullyRefactored = Object.keys(accepted)
+    const successfullyRefactored = Object.keys(accepted.resultsByFilePaths)
         .map((file, i) => `${i + 1}. \`${file}\``)
         .join('\n');
 
@@ -300,20 +319,45 @@ const currentRepositoryRefactoringReport = async (
 const currentRepositoryFailedRefactoringReport = async (
     opts: RefactorResult
 ) => {
-    const { discarded, sandboxDirectoryPath } = opts;
+    const { sandboxDirectoryPath } = opts;
 
-    const firstCommits = Object.entries(discarded)
+    const { discarded } = summarizeRefactorFilesResult(opts);
+
+    const firstCommits = Object.entries(discarded.resultsByFilePaths)
         .flatMap(([file, results]) => {
-            const firstCommit = results[0]?.steps[0]?.commit?.substring(0, 7);
+            const firstCommit = results[0]?.file.steps[0]?.commit?.substring(
+                0,
+                7
+            );
             if (!firstCommit) {
-                return [`# ${file}\n# No commits found for this file`];
+                return [];
             }
             return [`# ${file}\ngit cherry-pick -n ${firstCommit}`];
         })
         .join('\n');
 
-    const failedToRefactor = Object.keys(discarded)
-        .map((file, i) => `${i + 1}. \`${file}\``)
+    const failedToRefactor = Object.entries(discarded.resultsByFilePaths)
+        .map(([file, results], i) => {
+            const lastResult = results[results.length - 1]?.file;
+            if (lastResult?.status === 'failure') {
+                return `${i + 1}. \`${file}\` - ${
+                    lastResult.failureDescription
+                }`;
+            } else if (lastResult?.status === 'success') {
+                if (
+                    lastResult.steps.length === 1 &&
+                    !lastResult.steps[0]?.commit
+                ) {
+                    return `${
+                        i + 1
+                    }. \`${file}\` - the file appears to not require any changes`;
+                } else {
+                    return `${i + 1}. \`${file}\``;
+                }
+            } else {
+                return `${i + 1}. \`${file}\``;
+            }
+        })
         .join('\n');
 
     return format(
@@ -326,22 +370,27 @@ const currentRepositoryFailedRefactoringReport = async (
 
                     \`$sandboxDirectoryPath$\`
 
-                    Failed to refactor:
+                    Attempted to refactor:
 
                     %failedToRefactor%
 
-                    ## First commits of failed files
-
-                    These are least invasive commits focused on the goal which
-                    didn't pass checks. You can try to fix them manually.
-
-                    ~~~sh
                     %firstCommits%
-                    ~~~
                 `,
                 {
                     failedToRefactor,
-                    firstCommits,
+                    firstCommits: formatOptional({
+                        text: formatFencedCodeBlock({
+                            code: firstCommits,
+                            language: 'sh',
+                        }),
+                        heading: markdown`
+                            ## Change attempts that failed
+
+                            These should be least invasive commits focused on
+                            the goal which didn't pass checks. You can try to
+                            fix them manually.
+                        `,
+                    }),
                 }
             ),
         }),
@@ -395,7 +444,7 @@ const currentRepositoryPlanningEmptyReasonReport = async (
     );
 };
 
-const unhandledErrorReport = async (opts: { error: Error }) => {
+const unhandledErrorReport = async (opts: { error: unknown }) => {
     console.log(
         await glowFormat({
             input: format(
@@ -407,9 +456,14 @@ const unhandledErrorReport = async (opts: { error: Error }) => {
                     ~~~
                 `,
                 {
-                    errorDetails: formatObject(extractErrorInfo(opts.error), {
-                        indent: '',
-                    }),
+                    errorDetails: formatObject(
+                        opts.error instanceof Error
+                            ? extractErrorInfo(opts.error)
+                            : (opts.error as object),
+                        {
+                            indent: '',
+                        }
+                    ),
                 }
             ),
         })
@@ -448,26 +502,18 @@ const analyticsReport = async (opts: {
     performance: boolean;
 }) => {
     if (opts.costs) {
-        const total = opts.result.costsByStep.total.totalPrice;
-        const costs = Object.entries(opts.result.costsByStep)
-            .map(([step, costs]) => ({
+        const totalCost = summarizeLlmUsagePrice({
+            usage: opts.result.usage,
+        });
+        const costs = Array.from(totalCost.priceBySteps).map(
+            ([step, costs]) => ({
                 step,
                 costs,
-            }))
-            .filter((data) => data.step !== 'total' && data.step !== 'unknown');
+            })
+        );
 
         const orderedCosts = orderBy(costs, ['costs.totalPrice'], ['desc']);
 
-        const warning =
-            total === 0
-                ? '\n' +
-                  line`
-                      **NOTE** The costs cannot be collected if we cache high
-                      level steps. Try running the refactor with following
-                      options: \`--enable-cache-for chat exec check auto-fix\`
-                  `
-                : ``;
-
         console.log(
             await glowFormat({
                 input: format(
@@ -477,11 +523,9 @@ const analyticsReport = async (opts: {
                         Total cost: %total% USD
 
                         %costByStep%
-
-                        %warning%
                     `,
                     {
-                        total: total.toFixed(3),
+                        total: totalCost.totalPrice.toFixed(3),
                         costByStep: orderedCosts
                             .filter(
                                 /**
@@ -500,7 +544,6 @@ const analyticsReport = async (opts: {
                                 })
                             )
                             .join('\n'),
-                        warning,
                     }
                 ),
             })
@@ -510,14 +553,14 @@ const analyticsReport = async (opts: {
     if (opts.performance) {
         const perf = opts.result.performance;
 
-        const perfEntries = Object.entries(perf.durationByStep)
+        const perfEntries = Object.entries(perf.durationMsByStep)
             .map(([step, data]) => ({
                 step,
-                duration: data.duration,
+                durationMs: data.durationMs,
             }))
             .filter((data) => data.step !== 'total' && data.step !== 'unknown');
 
-        const orderedDurations = orderBy(perfEntries, ['duration'], ['desc']);
+        const orderedDurations = orderBy(perfEntries, ['durationMs'], ['desc']);
 
         console.log(
             await glowFormat({
@@ -531,12 +574,7 @@ const analyticsReport = async (opts: {
                     `,
                     {
                         totalDuration:
-                            (perf.totalDuration / 1000).toFixed(3) + ' sec',
-
-                        measuredDuration:
-                            (perf.durationByStep.total.duration / 1000).toFixed(
-                                3
-                            ) + ' sec',
+                            (perf.totalDurationMs / 1000).toFixed(3) + ' sec',
 
                         durationByStep: orderedDurations
                             .filter(
@@ -553,7 +591,7 @@ const analyticsReport = async (opts: {
                                 format(`- %step% - %duration%`, {
                                     step: data.step.padEnd(20, ' '),
                                     duration:
-                                        (data.duration / 1000).toFixed(3) +
+                                        (data.durationMs / 1000).toFixed(3) +
                                         ' sec',
                                 })
                             )
@@ -570,9 +608,9 @@ export async function runRefactor(opts: {
     name?: string;
     saveToCache?: boolean;
     enableCacheFor?: string[];
+    disableCacheFor?: string[];
     costs?: boolean;
     performance?: boolean;
-    //
 }) {
     try {
         const configs = await loadRefactorConfigs();
@@ -586,6 +624,7 @@ export async function runRefactor(opts: {
             config,
             saveToCache: opts.saveToCache,
             enableCacheFor: opts.enableCacheFor,
+            disableCacheFor: opts.disableCacheFor,
         });
 
         if (result.status === 'failure') {
@@ -607,6 +646,7 @@ export async function runRefactor(opts: {
                 );
             }
         }
+
         await analyticsReport({
             result,
             costs: opts.costs ?? true,
diff --git a/packages/refactor-bot/src/refactor/types.ts b/packages/refactor-bot/src/refactor/types.ts
index 916d322..094b8f3 100644
--- a/packages/refactor-bot/src/refactor/types.ts
+++ b/packages/refactor-bot/src/refactor/types.ts
@@ -23,6 +23,18 @@ export const refactorConfigSchema = z.object({
      */
     objective: z.string(),
 
+    /**
+     * List of files to edit or refactor - the refactoring will be done in the
+     * same order as the files are specified here. File names should be relative
+     * to the repository root.
+     *
+     * When not explicitly specified, the files are determined from the
+     * objective. When the objective doesn't explicitly state the files to be
+     * edited or refactored, the files are determined automatically by the LLM
+     * by analyzing the objective and contents of the repository.
+     */
+    filesToEdit: z.array(z.string()).nonempty().optional(),
+
     /**
      * Name of the `tsconfig.json` file to use for the refactor, defaults
      * to `tsconfig.json`. In mono-repos scenarios this will affect the name
@@ -168,6 +180,22 @@ export const refactorConfigSchema = z.object({
         })
         .optional(),
 
+    /**
+     * Whether to evaluate every file before accepting it as success and
+     * committing it to the repository.
+     *
+     * This will lead to a slower refactor, but will ensure that the files
+     * reported as refactored are actually valid and lead to a reported outcome
+     * that we can rely on.
+     */
+    evaluate: z.boolean().optional().default(true),
+
+    /**
+     * Minimal score to accept a file as refactored, only used when the
+     * `evaluate` option is enabled. Defaults to 0.5.
+     */
+    evaluateMinScore: z.number().optional().default(0.5),
+
     /**
      * Unique identifier of the refactor, used to identify and restore
      * the refactor state and finding the right sandbox when running
@@ -178,23 +206,39 @@ export const refactorConfigSchema = z.object({
 
 export type RefactorConfig = z.input<typeof refactorConfigSchema>;
 
+export const issueSchema = z.object({
+    command: z.string(),
+    issue: z.string(),
+    filePath: z.string(),
+    commit: z.string(),
+    code: z.string().optional(),
+});
+
+export const checkSummarySchema = z.object({
+    newIssues: z.array(issueSchema),
+    remainingIssues: z.array(issueSchema),
+    resolvedIssues: z.array(issueSchema),
+    totalNumberOfIssues: z.number(),
+});
+
 export const refactorStepResultSchema = z.object({
     task: z.string(),
+    key: z.string().optional(),
     fileContents: z.string(),
-    commit: z.string(),
+    commit: z.string().optional(),
     timestamp: z.number(),
+    checkSummary: checkSummarySchema.optional(),
 });
 
-export type RefactorStepResult = z.infer<typeof refactorStepResultSchema>;
-
-export const issueSchema = z.object({
-    command: z.string(),
-    issue: z.string(),
-    filePath: z.string(),
+export const refactorStepEmptyResultSchema = z.object({
+    task: z.string(),
     commit: z.string(),
-    code: z.string().optional(),
+    timestamp: z.number(),
+    checkSummary: checkSummarySchema,
 });
 
+export type RefactorStepResult = z.infer<typeof refactorStepResultSchema>;
+
 export const refactorSuccessResultSchema = z.object({
     status: z.literal('success'),
     filePath: z.string(),
@@ -221,107 +265,70 @@ export const refactorResultSchema = z.discriminatedUnion('status', [
 
 export type RefactorResult = z.infer<typeof refactorResultSchema>;
 
+export const llmUsageEntrySchema = z.object({
+    model: z.string(),
+    steps: z.array(z.string()),
+    usage: z.object({
+        promptTokens: z.number(),
+        completionTokens: z.number(),
+        totalTokens: z.number(),
+    }),
+});
+
+export const evaluateFileScoreSchema = z.object({
+    key: z.string().optional(),
+    score: z.number(),
+});
+
 export const refactorFileResultSchema = z.object({
+    /**
+     * Optional key to identify the file in the cache - for diagnostic
+     * purposes only.
+     */
+    key: z.string().optional(),
     file: refactorResultSchema,
+    usage: z.array(llmUsageEntrySchema),
+    evaluation: evaluateFileScoreSchema.optional(),
 });
 
 export type RefactorFileResult = z.infer<typeof refactorFileResultSchema>;
 
-export const refactorFilesRecordSchema = z.record(
-    z.string(),
-    z.array(refactorResultSchema)
-);
-
-export type RefactorResultByFilePathRecord = z.infer<
-    typeof refactorFilesRecordSchema
->;
-
 export const refactorFilesResultSchema = z.object({
-    accepted: z.record(z.string(), z.array(refactorResultSchema)),
-    discarded: z.record(z.string(), z.array(refactorResultSchema)),
+    accepted: z.array(refactorFileResultSchema),
+    discarded: z.array(refactorFileResultSchema),
 });
 
 export type RefactorFilesResult = z.infer<typeof refactorFilesResultSchema>;
 
-export const lastCommit = <T extends { commit: string }>(steps: T[]) => {
-    return steps[steps.length - 1]?.commit;
-};
-
-export const lastTimestamp = <T extends { timestamp: number }>(steps: T[]) => {
-    return steps[steps.length - 1]?.timestamp;
-};
-
-export const pushRefactorFileResults = (opts: {
-    result: RefactorResult;
-    into: RefactorResultByFilePathRecord;
-}) => {
-    const array = opts.into[opts.result.filePath];
-    if (array) {
-        array.push(opts.result);
-    } else {
-        opts.into[opts.result.filePath] = [opts.result];
+export const firstCommit = <T extends { commit: string }>(steps: T[]) => {
+    const first = steps[0];
+    if (!first) {
+        return undefined;
     }
+    return first.commit;
 };
 
-export const mutateToMergeRefactorRecords = (opts: {
-    from: RefactorResultByFilePathRecord;
-    into: RefactorResultByFilePathRecord;
-}) => {
-    for (const [file, tasks] of Object.entries(opts.from)) {
-        const existing = opts.into[file];
-        if (existing) {
-            opts.into[file] = existing.concat(tasks);
-        } else {
-            opts.into[file] = tasks;
-        }
+export const lastCommit = <T extends { commit?: string; lastCommit?: string }>(
+    steps: T[]
+) => {
+    const last = steps.filter((step) => step.commit || step.lastCommit)[
+        steps.length - 1
+    ];
+    if (!last) {
+        return undefined;
     }
+    return last.commit || last.lastCommit;
 };
 
-const moveRefactorFileResults = (opts: {
-    filePath: string;
-    into: RefactorResultByFilePathRecord;
-    from: RefactorResultByFilePathRecord;
-}) => {
-    opts.into[opts.filePath] = (opts.from[opts.filePath] || []).concat(
-        opts.into[opts.filePath] || []
-    );
-    delete opts.from[opts.filePath];
-};
-
-export const mutateToMergeRefactorFilesResults = (opts: {
-    from: RefactorFilesResult;
-    into: RefactorFilesResult;
-}) => {
-    mutateToMergeRefactorRecords({
-        from: opts.from.accepted,
-        into: opts.into.accepted,
-    });
-    mutateToMergeRefactorRecords({
-        from: opts.from.discarded,
-        into: opts.into.discarded,
-    });
-
-    /**
-     * @note if we get a file in discarded list after previously
-     * getting a file in accepted list, we move the file to the
-     * discarded list
-     */
-    for (const filePath of Object.keys(opts.into.discarded)) {
-        const accepted = opts.into.accepted[filePath];
-        if (accepted) {
-            moveRefactorFileResults({
-                filePath,
-                from: opts.into.accepted,
-                into: opts.into.discarded,
-            });
-        }
-    }
+export const lastTimestamp = <T extends { timestamp: number }>(steps: T[]) => {
+    return steps[steps.length - 1]?.timestamp;
 };
 
 export type Issue = z.infer<typeof issueSchema>;
 
 export const checkIssuesResultSchema = z.object({
     checkedFiles: z.array(z.string()).optional(),
+    commands: z.array(z.string()),
     issues: z.array(
         z.object({
             command: z.string(),
@@ -333,3 +340,64 @@ export const checkIssuesResultSchema = z.object({
 });
 
 export type CheckIssuesResult = z.infer<typeof checkIssuesResultSchema>;
+
+export const summarizeRefactorFileResultArray = (
+    results: Array<RefactorFileResult>
+) => {
+    const filePaths = results.map((result) => result.file.filePath);
+    return {
+        resultsByFilePaths: Object.fromEntries(
+            filePaths.map((filePath) => [
+                filePath,
+                results.filter((result) => result.file.filePath === filePath),
+            ])
+        ),
+        usageByFilePaths: Object.fromEntries(
+            filePaths.map((filePath) => [
+                filePath,
+                results
+                    .filter((result) => result.file.filePath === filePath)
+                    .flatMap((result) => result.usage),
+            ])
+        ),
+    };
+};
+
+export const summarizeRefactorFilesResult = (
+    results: RefactorFilesResult,
+    opts?: {
+        filterOutDuplicatesFromDiscarded?: boolean;
+    }
+) => {
+    const accepted = summarizeRefactorFileResultArray(results.accepted);
+    const discarded = summarizeRefactorFileResultArray(results.discarded);
+
+    /**
+     * The algorithm is built in a way that it relies on "planFiles" to
+     * return a list of files to be refactored. This means that if a file
+     * was already refactored, but for some reason the refactoring produced
+     * an outcome that "planFiles" wants to still refactor, the file will
+     * be refactored again - which will cause an infinite loop, which will be
+     * safely detected by the algorithm and the refactoring for that file will
+     * be aborted.
+     *
+     * This will cause some "accepted" files to be duplicated in the "discarded"
+     * list, which we filter out here for cleaner UX purposes - the users do not
+     * see the "discarded" commits anyway.
+     */
+    if (opts?.filterOutDuplicatesFromDiscarded ?? true) {
+        const discardedFilePaths = Object.keys(discarded.resultsByFilePaths);
+        const duplicatedFilePaths = discardedFilePaths.filter(
+            (filePath) => filePath in accepted.resultsByFilePaths
+        );
+        duplicatedFilePaths.forEach((filePath) => {
+            delete discarded.resultsByFilePaths[filePath];
+            delete discarded.usageByFilePaths[filePath];
+        });
+    }
+
+    return {
+        accepted,
+        discarded,
+    };
+};
diff --git a/packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.test.ts b/packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.test.ts
new file mode 100644
index 0000000..a2f2def
--- /dev/null
+++ b/packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.test.ts
@@ -0,0 +1,142 @@
+import { expect, it } from '@jest/globals';
+import dedent from 'dedent';
+
+import { markdown } from '../markdown/markdown';
+import { parseFencedCodeBlocks } from './parseFencedCodeBlocks';
+
+it('should return empty array when no blocks', () => {
+    expect(parseFencedCodeBlocks(`Hello world`)).toEqual([]);
+});
+
+it('should parse single block', () => {
+    expect(
+        parseFencedCodeBlocks(markdown`
+            ~~~TypeScript
+              // some code
+            ~~~
+        `)
+    ).toEqual([
+        {
+            code: `  // some code\n`,
+            language: 'TypeScript',
+        },
+    ]);
+});
+
+it('should be allowed without language tag', () => {
+    expect(
+        parseFencedCodeBlocks(markdown`
+            ~~~
+            unknown language
+            ~~~
+        `)
+    ).toEqual([
+        {
+            code: 'unknown language\n',
+        },
+    ]);
+});
+
+it('should allow empty code block', () => {
+    expect(
+        parseFencedCodeBlocks(dedent`
+            ~~~txt
+            ~~~
+        `)
+    ).toEqual([
+        {
+            code: '',
+            language: 'txt',
+        },
+    ]);
+});
+
+it('should parse backtick block', () => {
+    expect(
+        parseFencedCodeBlocks(dedent`
+            \`\`\`md
+            Hello!
+            \`\`\`
+        `)
+    ).toEqual([
+        {
+            code: `Hello!\n`,
+            language: 'md',
+        },
+    ]);
+});
+
+it('should parse multiple blocks', () => {
+    expect(
+        parseFencedCodeBlocks(dedent`
+            Confirm the file is there:
+            \`\`\`sh
+            ls .
+            \`\`\`
+
+            Then execute:
+            \`\`\`sh
+            cat file.txt
+            \`\`\`
+        `)
+    ).toEqual([
+        {
+            code: `ls .\n`,
+            language: 'sh',
+        },
+        {
+            code: `cat file.txt\n`,
+            language: 'sh',
+        },
+    ]);
+});
+
+it('should not trigger internal code blocks', () => {
+    expect(
+        parseFencedCodeBlocks(markdown`
+            ~~~TypeScript
+            const value = /* yaml */\`
+                \`\`\`TypeScript
+                  // some code
+                \`\`\`
+            \`
+            ~~~
+        `)
+    ).toEqual([
+        {
+            code: dedent/* ts */ `
+                const value = /* yaml */\`
+                    \`\`\`TypeScript
+                      // some code
+                    \`\`\`
+                \`\n
+            `,
+            language: 'TypeScript',
+        },
+    ]);
+});
+
+it('should not trigger other internal code blocks on the same level', () => {
+    expect(
+        parseFencedCodeBlocks(dedent`
+            ~~~~TypeScript
+            const value = /* ts */\`
+            ~~~TypeScript
+            // some code
+            ~~~
+            \`
+            ~~~~
+        `)
+    ).toEqual([
+        {
+            code: dedent`
+                const value = /* ts */\`
+                ~~~TypeScript
+                // some code
+                ~~~
+                \`\n
+            `,
+            language: 'TypeScript',
+        },
+    ]);
+});
diff --git a/packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.ts b/packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.ts
new file mode 100644
index 0000000..8ca8d30
--- /dev/null
+++ b/packages/refactor-bot/src/response-parsers/parseFencedCodeBlocks.ts
@@ -0,0 +1,60 @@
+import { line } from '../text/line';
+import { escapeRegExp } from '../utils/escapeRegExp';
+import { firstLineOf } from '../utils/firstLineOf';
+
+export function parseFencedCodeBlocks(text: string) {
+    const results: Array<{
+        code: string;
+        language?: string;
+    }> = [];
+
+    const openingMarkerRegex =
+        /^(?<openingMarker>(```*)|(~~~*))(?<language>\w+)?\n/gm;
+
+    let openingRes = openingMarkerRegex.exec(text);
+    while (openingRes) {
+        const { openingMarker, language } = openingRes.groups as {
+            openingMarker: string;
+            language?: string;
+        };
+
+        const closingMarkerRegex = new RegExp(
+            `^${escapeRegExp(openingMarker)}$`,
+            'gm'
+        );
+
+        closingMarkerRegex.lastIndex = openingMarkerRegex.lastIndex;
+        const closingRes = closingMarkerRegex.exec(text);
+
+        if (!closingRes) {
+            const firstLineOfCode = firstLineOf(
+                text.slice(openingMarkerRegex.lastIndex)
+            );
+            throw new Error(
+                line`
+                    Could not find closing fenced code block marker
+                    "${openingMarker}" for opening marker at index
+                    ${openingMarkerRegex.lastIndex} which starts as
+                    "${openingRes[0]}↩︎${firstLineOfCode}"
+                `
+            );
+        }
+
+        const code = text.slice(
+            openingMarkerRegex.lastIndex,
+            closingMarkerRegex.lastIndex - closingRes[0].length
+        );
+
+        results.push({
+            code,
+            ...(language && {
+                language,
+            }),
+        });
+
+        openingMarkerRegex.lastIndex = closingMarkerRegex.lastIndex;
+        openingRes = openingMarkerRegex.exec(text);
+    }
+
+    return results;
+}
diff --git a/packages/refactor-bot/src/response-parsers/parseJsonResponse.test.ts b/packages/refactor-bot/src/response-parsers/parseJsonResponse.test.ts
new file mode 100644
index 0000000..6b86080
--- /dev/null
+++ b/packages/refactor-bot/src/response-parsers/parseJsonResponse.test.ts
@@ -0,0 +1,58 @@
+import { expect, it } from '@jest/globals';
+import dedent from 'dedent';
+import { z } from 'zod';
+
+import { parseJsonResponse } from './parseJsonResponse';
+
+it('should return parsed response', () => {
+    expect(
+        parseJsonResponse(
+            dedent`
+                \`\`\`json
+                {
+                    "summary": "The objective was not achieved. The algorithm did not replace 'readFile' from 'fs/promises' with 'readFileSync' from 'fs'. Instead, it redefined 'readFile' within the 'defaultDeps' object to call itself recursively without any changes, which will lead to a stack overflow if executed.",
+                    "requirements": [
+                        {
+                            "description": "Replace all usages of 'readFile' from 'fs/promises' with 'readFileSync' from 'fs'",
+                            "isObjective": true,
+                            "satisfied": false
+                        },
+                        {
+                            "description": "The code produced is minimal and doesn't make modifications which are unnecessary or not requested by the user",
+                            "isObjective": false,
+                            "satisfied": false
+                        }
+                    ]
+                }
+                \`\`\`
+            `,
+            z.object({
+                summary: z.string(),
+                requirements: z.array(
+                    z.object({
+                        description: z.string(),
+                        isObjective: z.boolean(),
+                        satisfied: z.boolean(),
+                    })
+                ),
+            })
+        )
+    ).toEqual({
+        summary:
+            "The objective was not achieved. The algorithm did not replace 'readFile' from 'fs/promises' with 'readFileSync' from 'fs'. Instead, it redefined 'readFile' within the 'defaultDeps' object to call itself recursively without any changes, which will lead to a stack overflow if executed.",
+        requirements: [
+            {
+                description:
+                    "Replace all usages of 'readFile' from 'fs/promises' with 'readFileSync' from 'fs'",
+                isObjective: true,
+                satisfied: false,
+            },
+            {
+                description:
+                    "The code produced is minimal and doesn't make modifications which are unnecessary or not requested by the user",
+                isObjective: false,
+                satisfied: false,
+            },
+        ],
+    });
+});
diff --git a/packages/refactor-bot/src/response-parsers/parseJsonResponse.ts b/packages/refactor-bot/src/response-parsers/parseJsonResponse.ts
new file mode 100644
index 0000000..30e07e7
--- /dev/null
+++ b/packages/refactor-bot/src/response-parsers/parseJsonResponse.ts
@@ -0,0 +1,52 @@
+import { z, ZodError } from 'zod';
+
+import { parseJsonSchema } from '../utils/parseJsonSchema';
+import { parseFencedCodeBlocks } from './parseFencedCodeBlocks';
+
+export function parseJsonResponse<Schema extends z.ZodType<unknown>>(
+    content: string,
+    schema: Schema
+) {
+    const json = parseJsonSchema(schema);
+
+    const fencedJson = z
+        .string()
+        .transform((content, ctx) => {
+            try {
+                const blocks = parseFencedCodeBlocks(content).filter(
+                    (block) => !block.language || block.language === 'json'
+                );
+
+                if (blocks.length === 0) {
+                    ctx.addIssue({
+                        message: 'No fenced code blocks found of json type',
+                        code: z.ZodIssueCode.custom,
+                    });
+                }
+
+                return blocks.map((block) => block.code);
+            } catch (err) {
+                ctx.addIssue({
+                    message: String(err),
+                    code: z.ZodIssueCode.custom,
+                });
+                return z.NEVER;
+            }
+        })
+        .pipe(z.array(json).nonempty());
+
+    const nonFencedJson = json.transform((response) => [response] as const);
+
+    const schemas = [nonFencedJson, fencedJson];
+    const results = schemas.map((schema) => schema.safeParse(content));
+
+    const successResult = results.find((r) => r.success);
+
+    if (successResult && successResult.success) {
+        return successResult.data[0];
+    } else {
+        throw new ZodError([
+            ...results.flatMap((r) => (r.success ? [] : r.error.issues)),
+        ]);
+    }
+}
diff --git a/packages/refactor-bot/src/utils/hasOne.ts b/packages/refactor-bot/src/utils/hasOne.ts
index c67a60b..3017644 100644
--- a/packages/refactor-bot/src/utils/hasOne.ts
+++ b/packages/refactor-bot/src/utils/hasOne.ts
@@ -28,14 +28,14 @@ export function hasTwoElements(arr: unknown[] | readonly unknown[]): boolean {
 }
 
 export function ensureHasTwoElements<
-    Arr extends unknown[] | readonly unknown[]
+    Arr extends unknown[] | readonly unknown[],
 >(arr: Arr): [Arr[0], Arr[1], ...Arr[number][]];
 export function ensureHasTwoElements<Arr extends readonly unknown[]>(
     arr: Arr
 ): readonly [Arr[0], Arr[1], ...Arr[number][]];
 export function ensureHasTwoElements<
-    Arr extends unknown[] | readonly unknown[]
+    Arr extends unknown[] | readonly unknown[],
 >(arr: Arr) {
-    assert(hasOneElement<Arr[number]>(arr));
+    assert(hasTwoElements<Arr[number]>(arr));
     return arr;
 }
diff --git a/packages/refactor-bot/src/utils/isFileNotFoundError.ts b/packages/refactor-bot/src/utils/isFileNotFoundError.ts
new file mode 100644
index 0000000..52fc8f0
--- /dev/null
+++ b/packages/refactor-bot/src/utils/isFileNotFoundError.ts
@@ -0,0 +1,5 @@
+export function isFileNotFoundError(err: unknown) {
+    return Boolean(
+        typeof err === 'object' && err && 'code' in err && err.code === 'ENOENT'
+    );
+}
diff --git a/packages/refactor-bot/src/utils/parseJsonSchema.ts b/packages/refactor-bot/src/utils/parseJsonSchema.ts
new file mode 100644
index 0000000..ab12cb8
--- /dev/null
+++ b/packages/refactor-bot/src/utils/parseJsonSchema.ts
@@ -0,0 +1,22 @@
+import { z } from 'zod';
+
+export function parseJsonSchema<Schema extends z.ZodType>(schema: Schema) {
+    return z
+        .unknown()
+        .pipe(z.string())
+        .transform((content: string, ctx) => {
+            try {
+                return JSON.parse(content) as unknown;
+            } catch (err) {
+                const message =
+                    err instanceof Error ? err.message : String(err);
+                ctx.addIssue({
+                    message: `Not a valid JSON string - ${message}`,
+                    code: z.ZodIssueCode.custom,
+                    path: ctx.path,
+                });
+                return z.NEVER;
+            }
+        })
+        .pipe(schema);
+}
diff --git a/packages/refactor-bot/tsconfig.json b/packages/refactor-bot/tsconfig.json
index 57e4c48..a0a2be3 100644
--- a/packages/refactor-bot/tsconfig.json
+++ b/packages/refactor-bot/tsconfig.json
@@ -3,7 +3,7 @@
     "compilerOptions": {
         "outDir": ".tsc-out",
         "tsBuildInfoFile": ".tsc-out/.tsbuildinfo",
-        "noErrorTruncation": true
+        "noErrorTruncation": false
     },
     "include": ["src/**/*.ts", "src/ts-morph/builtinLibs.cjs"]
 }