-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_layer_sweep.py
99 lines (84 loc) · 3.81 KB
/
run_layer_sweep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""Script to compute steerability over all layers and plot the results"""
import torch
import numpy as np
import pathlib
import json
from steering_vectors import train_steering_vector
from steering_bench.build_training_data import build_steering_vector_training_data
from steering_bench.core.evaluate import evaluate_propensities_on_dataset
from steering_bench.utils.torch import load_model_with_quantization, EmptyTorchCUDACache
from steering_bench.dataset import build_dataset, DatasetSpec
from steering_bench.core.format import Formatter
from steering_bench.core.pipeline import Pipeline
from steering_bench.core.propensity import LogProbDifference
from steering_bench.core.hook import SteeringHook
from steering_bench.metric import get_steerability_slope
curr_dir = pathlib.Path(__file__).parent.absolute()


def _load_or_train_steering_vector(pipeline, train_dataset, sv_save_path):
    """Return the steering vector, loading it from ``sv_save_path`` if cached.

    When the file does not exist, the vector is trained from ``train_dataset``
    with ``train_steering_vector`` and written to ``sv_save_path`` so later
    runs can reuse it.
    """
    if sv_save_path.exists():
        print("Loading steering vector")
        # The cache holds a pickled Python object written by this script, not
        # a plain state dict, so it needs weights_only=False (the default
        # became True in PyTorch 2.6). Unpickling can execute arbitrary code:
        # only load files this script produced itself.
        return torch.load(sv_save_path, weights_only=False)

    print("Training steering vector")
    training_data = build_steering_vector_training_data(pipeline, train_dataset)
    steering_vector = train_steering_vector(
        pipeline.model,
        pipeline.tokenizer,
        training_data,
    )
    torch.save(steering_vector, sv_save_path)
    return steering_vector


def _evaluate_layer(
    pipeline, steering_vector, layer, test_dataset, propensity_fn, multipliers
):
    """Evaluate propensities on ``test_dataset`` while steering at ``layer``.

    Returns whatever ``evaluate_propensities_on_dataset`` returns for the
    given multipliers.
    """
    # Create the steering hook, which applies the steering vector to the model
    steering_hook = SteeringHook(
        steering_vector,
        direction_multiplier=0.0,  # Placeholder value; will be overwritten by evaluate_propensities
        layer=layer,
        patch_generation_tokens_only=True,  # Only patch tokens generated by the model
        skip_first_n_generation_tokens=1,  # Skip the first token '('
        patch_operator="add",
    )
    with EmptyTorchCUDACache():
        print(f"Running layer {layer}")
        # Start from a clean pipeline so hooks from earlier layers cannot leak in.
        pipeline.hooks.clear()
        propensities = evaluate_propensities_on_dataset(
            pipeline,
            steering_hook,
            test_dataset,
            propensity_fn=propensity_fn,
            multipliers=multipliers,
        )
        # Sanity check: evaluation must not leave any hook registered.
        assert len(pipeline.hooks) == 0
    return propensities


def main() -> None:
    """Run the layer sweep and write the results under ``layer_sweep_results``.

    For every layer the propensities are stored in
    ``propensities_layer_{layer}.npy``; the mean steerability slope per layer
    is stored in ``steerabilities.json``. Layers whose propensity file already
    exists are not re-evaluated, but their cached propensities are still used
    so that the JSON summary always covers every layer.
    """
    save_dir = curr_dir / "layer_sweep_results"
    save_dir.mkdir(exist_ok=True)

    # Load the dataset
    dataset_name = "corrigible-neutral-HHH"
    train_spec = DatasetSpec(name=dataset_name, split="0%:10%", seed=0)
    test_spec = DatasetSpec(name=dataset_name, split="99%:100%", seed=0)
    train_dataset = build_dataset(train_spec)
    test_dataset = build_dataset(test_spec)

    # Load the model and tokenizer
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    model, tokenizer = load_model_with_quantization(model_name, load_in_8bit=True)
    formatter = Formatter()
    pipeline = Pipeline(model=model, tokenizer=tokenizer, formatter=formatter)

    # Train the steering vector, or load a saved one
    steering_vector = _load_or_train_steering_vector(
        pipeline, train_dataset, save_dir / "steering_vector.pt"
    )

    # Evaluate propensity and steerability
    multipliers = np.array([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
    propensity_score = LogProbDifference()
    # Number of layers swept; must be kept in sync with model_name.
    num_layers = 32
    steerabilities: dict[int, float] = {}
    for layer in range(num_layers):
        propensity_save_path = save_dir / f"propensities_layer_{layer}.npy"
        if propensity_save_path.exists():
            print(f"Skipping layer {layer}")
            # Reuse the cached result instead of dropping the layer, otherwise
            # a resumed run would overwrite steerabilities.json with only the
            # newly evaluated layers.
            # NOTE(review): assumes the saved propensities are a plain numeric
            # array (loadable without allow_pickle) - confirm.
            propensities = np.load(propensity_save_path)
        else:
            propensities = _evaluate_layer(
                pipeline,
                steering_vector,
                layer,
                test_dataset,
                propensity_score,
                multipliers,
            )
            # Save propensities right away so the expensive evaluation is not
            # lost if a later step fails.
            np.save(propensity_save_path, propensities)

        steerability = get_steerability_slope(multipliers, propensities)
        print(
            f"Steerability slope: {steerability.mean():.2f} +- {steerability.std():.2f}"
        )
        # Convert to a builtin float: NumPy scalars such as float32 are not
        # JSON serializable.
        steerabilities[layer] = float(steerability.mean())

    # Save steerabilities
    steerability_save_path = save_dir / "steerabilities.json"
    with open(steerability_save_path, "w", encoding="utf-8") as f:
        json.dump(steerabilities, f, indent=2)


if __name__ == "__main__":
    main()