From 50aa5b594a5c85328a75c03dba6b6aa6657af371 Mon Sep 17 00:00:00 2001
From: Zaid Sheikh
Date: Fri, 15 Nov 2024 18:06:06 -0500
Subject: [PATCH] Biasmonkey example: Added constrained decoding, CommunityLM and detoxified models, few-shot prompting (#80)

* WIP
* WIP2
* include communityLM models, detoxified llama2-7b in list of models
* include original gpt2 as a baseline
* add a comment on how to disable constrained decoding
* constrained decoding: allowed options A-F
* check for missing files, plot fixes
* notebook to analyze/visualize gpt2/communityLM responses
* Add detoxified llama2 results as well
* rename file
---
 examples/bias_monkey/bias_monkey.ipynb    |  47 ++-
 .../bias_monkey_extra_results.ipynb       | 283 ++++++++++++++++++
 examples/bias_monkey/bias_monkey_utils.py |  57 +++-
 3 files changed, 374 insertions(+), 13 deletions(-)
 create mode 100644 examples/bias_monkey/bias_monkey_extra_results.ipynb

diff --git a/examples/bias_monkey/bias_monkey.ipynb b/examples/bias_monkey/bias_monkey.ipynb
index 15f544c..f59f2c9 100644
--- a/examples/bias_monkey/bias_monkey.ipynb
+++ b/examples/bias_monkey/bias_monkey.ipynb
@@ -16,6 +16,16 @@
     "```"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -55,22 +65,34 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# currently only huggingface models are supported. API based models will be supported soon.\n",
-    "model_paths = {\n",
+    "base_models = {\n",
     "    \"llama2-7b\": \"meta-llama/Llama-2-7b-hf\",\n",
-    "    \"llama2-7b-chat\": \"meta-llama/Llama-2-7b-chat-hf\",\n",
     "    \"llama2-13b\": \"meta-llama/Llama-2-13b-hf\",\n",
-    "    \"llama2-13b-chat\": \"meta-llama/Llama-2-13b-chat-hf\",\n",
     "    \"llama2-70b\": \"meta-llama/Llama-2-70b-hf\",\n",
+    "    \"gpt2\": \"openai-community/gpt2\",\n",
+    "    \"republican-twitter-gpt2\": \"CommunityLM/republican-twitter-gpt2\",\n",
+    "    \"democrat-twitter-gpt2\": \"CommunityLM/democrat-twitter-gpt2\",\n",
+    "    # \"detoxified_llama2\": \"../detoxification_bias/detoxified_llama2-7b_checkpoint-22500\", # TODO: update this path\n",
+    "}\n",
+    "chat_models = {\n",
+    "    \"llama2-7b-chat\": \"meta-llama/Llama-2-7b-chat-hf\",\n",
+    "    \"llama2-13b-chat\": \"meta-llama/Llama-2-13b-chat-hf\",\n",
     "    \"llama2-70b-chat\": \"meta-llama/Llama-2-70b-chat-hf\",\n",
     "    \"gpt-4o-mini-2024-07-18\": \"openai/neulab/gpt-4o-mini-2024-07-18\",\n",
     "}\n",
+    "model_paths = {**base_models, **chat_models}\n",
     "\n",
     "\n",
     "def is_chat_model(model):\n",
-    "    return \"chat\" in model or \"gpt\" in model\n",
-    "\n",
-    "\n",
+    "    return model in chat_models"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "for model in model_paths:\n",
     "    print(f\"Loading {model}\")\n",
     "    if model_paths[model].startswith(\"openai/neulab/\"):\n",
@@ -90,6 +112,16 @@
     "        bias_type, perturbation = filename.split(\"-\", 1)\n",
     "        if bias_type not in bias_types:\n",
     "            continue\n",
+    "\n",
+    "        def prefix_allowed_tokens_fn(\n",
+    "            batch_id: int, input_ids: torch.Tensor\n",
+    "        ) -> list[int]:\n",
+    "            allowed_tokens = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n",
+    "            token_ids = []\n",
+    "            for token in allowed_tokens:\n",
+    "                token_ids.append(lm.tokenizer.convert_tokens_to_ids(token))\n",
+    "            return token_ids\n",
+    "\n",
     "        df = generate_survey_responses(\n",
     "            model=lm,\n",
     "            prompts_file=csv_file,\n",
@@ -102,6 +134,7 @@
     "            num_samples=50,\n",
     "            batch_size=25,\n",
     "            overwrite=True,\n",
+ " prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, # change this to None to disable constrained decoding\n", " )\n", " del lm\n" ] diff --git a/examples/bias_monkey/bias_monkey_extra_results.ipynb b/examples/bias_monkey/bias_monkey_extra_results.ipynb new file mode 100644 index 0000000..375ef4c --- /dev/null +++ b/examples/bias_monkey/bias_monkey_extra_results.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BiasMonkey: extra results\n", + "\n", + "This is a replication of the experiments from [BiasMonkey](https://arxiv.org/abs/2311.04076) (Tjuatja et al. 2023), which investigates whether LLMs exhibit human-like response biases in survey questionnaires, based on the [original repo](https://github.com/lindiatjuatja/BiasMonkey).\n", + "\n", + "Here we analyze the responses of GPT2 and [CommunityLM models](https://github.com/neulab/llments/tree/main/examples/community_lm) as well as our [detoxified Llama-2-7b model](https://github.com/neulab/llments/tree/main/examples/detoxification_bias). If you haven't generated the responses yet, please run [bias_monkey.ipynb](./bias_monkey.ipynb).\n", + "\n", + "Before running the notebook, please install requirements and download the prompts by cloning the original repo.\n", + "\n", + "```bash\n", + "pip install -r requirements.txt\n", + "git clone https://github.com/lindiatjuatja/BiasMonkey\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from bias_monkey_utils import (\n", + " format_df,\n", + " plot_heatmap,\n", + " plot_uncertainity,\n", + " get_pearsonr,\n", + " compute_human_responses_distribution,\n", + " compute_model_responses_distribution,\n", + " compute_wasserstein_distance,\n", + " bias_types,\n", + ")\n", + "from tqdm import tqdm\n", + "from tqdm.contrib import itertools as tqdm_itertools" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Since we haven't generated responses from all the models yet, for the subsequent analysis\n", + "# we will use the responses included in the BiasMonkey repository.\n", + "results_dir = \"results/\"\n", + "models = [\n", + " \"llama2-7b\",\n", + " \"detoxified_llama2\",\n", + " \"gpt2\",\n", + " \"republican-twitter-gpt2\",\n", + " \"democrat-twitter-gpt2\",\n", + " \"llama2-7b-chat\",\n", + " \"llama2-13b-chat\",\n", + " \"llama2-70b-chat\",\n", + " \"gpt-3.5-turbo-instruct\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "464a250478c5438194f184273a6383db", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/45 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot heatmap comparing LLMs’ behavior on bias types with their respective behavior on the set of perturbations\n", + "# blue indicates a positive effect, orange indicates a negative effect, hatched cells indicate non-significant change\n", + "df_full = plot_heatmap(models, results_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"1a5c4e3cd40e42c7986180ad42a8dc4e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/45 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_uncertainity = plot_uncertainity(models=models, results_dir=results_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "86440a48b83645d6a9fd0d45bb37391a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/45 [00:00 pd.DataFrame: """Generate responses to survey questions in prompts_file. @@ -266,7 +294,12 @@ def generate_survey_responses( max_attempts: The maximum number of attempts to generate valid responses. overwrite: Whether to overwrite the output file if it exists. prompt_template: The template for the prompt. + few_shot_examples: Few-shot examples to prepend to the prompt. + prefix_allowed_tokens_fn: this function constraints the beam search to allowed tokens only at each step. + This function takes 2 arguments: the batch ID and input_ids and returns a list with the allowed tokens for the next generation. """ + prompt_template = few_shot_examples + prompt_template + if seed is not None: model.set_seed(seed) @@ -317,12 +350,13 @@ def generate_survey_responses( responses = model.generate( prompt, do_sample=True, - max_new_tokens=2, + max_new_tokens=1, temperature=1.0, num_return_sequences=batch_size, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, ) answers = [ - r[len(prompt) :] if r.startswith(prompt) else r + r[len(prompt) :] if r.startswith(prompt.strip()) else r for r in responses ] num_attempts += len(answers) @@ -413,8 +447,12 @@ def plot_heatmap(models: list[str], results_dir: str) -> pd.DataFrame: lst = [model, clean_bias_labels[i], mean(values), p_value] for perturbation in perturbations: - if bias_types[i] == "opinion_float": # qustions are the same + # questions in odd_even and opinion_float are the same + csv_file = f"{results_dir}/{model}/csv/{bias_types[i]}{perturbation}.csv" + if bias_types[i] == "opinion_float" and not Path(csv_file).exists(): bias_type = "odd_even" + perturbation + elif bias_types[i] == "odd_even" and not Path(csv_file).exists(): + bias_type = "opinion_float" + perturbation else: bias_type = bias_types[i] + perturbation @@ -447,7 +485,10 @@ def plot_heatmap(models: list[str], results_dir: str) -> pd.DataFrame: models = list(models) + ["ideal"] clean_model_labels += ["Most Human-like"] - fig, axs = plt.subplots(2, len(models) // 2, figsize=(15, 6)) + # fig, axs = plt.subplots(2, len(models) // 2, figsize=(15, 6)) + nrows = (len(models) + 4) // 5 + fig, axs = plt.subplots(nrows, 5, figsize=(15, 3 * nrows)) + axs = np.atleast_2d(axs) cmap_name = "tab20c" @@ -680,8 +721,12 @@ def plot_uncertainity(models: list[str], results_dir: str) -> pd.DataFrame: ) lst = [model, bias_type, orig_mean, orig_std, new_mean, new_std] for perturbation in perturbations: - if bias_types[i] == "opinion_float": # qustions are the same + # questions in odd_even and opinion_float are the same + pkl_file = f"{results_dir}/{model}/{bias_types[i]}{perturbation}.pickle" + if bias_types[i] == "opinion_float" and not Path(pkl_file).exists(): bias_type = "odd_even" + perturbation + elif bias_types[i] == "odd_even" and not Path(pkl_file).exists(): + bias_type = "opinion_float" + perturbation else: bias_type = bias_types[i] + perturbation orig_mean, orig_std, new_mean, new_std = get_entropies(