code

TAU-VAILab · Nov 27, 2023 · bcfca95 · bcfca95
1 parent 03e3267
commit bcfca95
Show file tree

Hide file tree

Showing 7 changed files with 137 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,18 @@
 
 This is the official repository for the paper: *Morris Alper and Hadar Averbuch-Elor (2023). Kiki or Bouba? Sound Symbolism in Vision-and-Language Models. NeurIPS 2023*
 
+## Code
+
+All code has been tested with Python 3.9. After installing the required libraries (`pip install -r requirements.txt`), you may reproduce our results as follows:
+
+* CLIP:
+  * `python src/clip_tests.py`
+* Stable Diffusion (SD):
+  * `python src/sd_generate_images.py`
+  * `python src/sd_tests.py`
+
+Note that the SD scripts must be run in order, as the first generates images which are used by the second.
+
 ## Licence
 
 We release our code under the [MIT license](https://opensource.org/license/mit/).

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,6 @@
+numpy>=1.23.5
+pandas>=2.1.1
+scikit-learn>=1.0.2
+torch>=2.0.0
+tqdm>=4.65.0
+transformers>=4.30.2
diff --git a/src/__pycache__/config.cpython-38.pyc b/src/__pycache__/config.cpython-38.pyc
diff --git a/src/clip_tests.py b/src/clip_tests.py
@@ -0,0 +1,77 @@
+from transformers import AutoTokenizer, CLIPTextModelWithProjection
+from transformers import logging as transformers_logging
+from utils import sharp_cats, round_cats, words, probe, is_word_round, is_word_sharp, sharp_words, round_words
+import pandas as pd
+from tqdm.auto import tqdm
+import numpy as np
+from sklearn.metrics import roc_auc_score
+from scipy.stats import kendalltau
+
+def main():
+    transformers_logging.set_verbosity_error()
+
+    MODEL_ID = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
+    device = 'cuda'
+
+    print("Loading model...")
+    model = CLIPTextModelWithProjection.from_pretrained(MODEL_ID).to(device).eval()
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    print("Model loaded")
+
+    ### Geometric Scoring ###
+
+    def probe_s(word):
+        return probe(word, model, tokenizer, cats=sharp_cats)
+    def probe_r(word):
+        return probe(word, model, tokenizer, cats=round_cats)
+
+    scores = {}
+    for word in tqdm(words, desc="Calculating geometric scores"):
+        scores[word] = (probe_s(word), probe_r(word))
+
+    pdf = pd.DataFrame({'word': scores.keys()}) # pseudoword df
+    pdf['is_sharp'] = pdf.word.apply(is_word_sharp)
+    pdf['is_round'] = pdf.word.apply(is_word_round)
+    assert (pdf.is_sharp ^ pdf.is_round).all(), "Some word is neither sharp nor round."
+
+    pairs = pdf.word.map(scores)
+    pdf['s'] = pairs.apply(lambda x: x[0])
+    pdf['r'] = pairs.apply(lambda x: x[1])
+    pdf['delta'] = pdf.r - pdf.s
+
+    kd = probe_r('kiki') - probe_s('kiki')
+    bd = probe_r('bouba') - probe_s('bouba')
+    dPkb = (pdf.delta < bd).mean() - (pdf.delta < kd).mean()
+
+    auc = roc_auc_score(pdf.is_round, pdf.delta)
+    tau = kendalltau(pdf.is_round, pdf.delta).statistic
+
+    print('Geometric scoring metrics:')
+    print(f'\tAUC:\t{auc:.2f}')
+    print(f'\tTau:\t{tau:.2f}')
+    print(f'\tdPkb:\t{dPkb:.2f}')
+
+    ### Phonetic Scoring ###
+
+    def score_adj(adj):
+        template = 'a 3D rendering of a {} shaped object'
+        return (
+            probe(adj, model, tokenizer, template=template, cats=round_words)
+            - probe(adj, model, tokenizer, template=template, cats=sharp_words)
+        )
+    adf = pd.DataFrame({ # adjective df
+        'adj': sharp_cats + round_cats,
+        'c': [0] * len(sharp_cats) + [1] * len(round_cats)
+    })
+    tqdm.pandas(desc="Calculating phonetic scores")
+    adf['score'] = adf.adj.progress_apply(score_adj)
+
+    auc_phon = roc_auc_score(adf.c, adf.score)
+    tau_phon = kendalltau(adf.c, adf.score).statistic
+
+    print('Phonetic scoring metrics:')
+    print(f'\tAUC:\t{auc_phon:.2f}')
+    print(f'\tTau:\t{tau_phon:.2f}')
+
+# Run the tests only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/src/sd_generate_images.py b/src/sd_generate_images.py
diff --git a/src/sd_tests.py b/src/sd_tests.py
diff --git a/src/utils.py b/src/utils.py
@@ -0,0 +1,42 @@
+import torch
+import numpy as np
+
+sharp_cats = 'sharp spiky angular jagged hard edgy pointed prickly rugged uneven'.split()
+round_cats = 'round circular soft fat chubby curved smooth plush plump rotund'.split()
+
+C = 'bdgktpslhmnx'
+V = 'aeiou'
+HARD_SOUNDS = set('ptkshixe')
+SOFT_SOUNDS = set('bdglumno')
+words = [f'{c1}{v1}{c2}{v2}{c1}{v1}' for c1 in C for c2 in C for v1 in V for v2 in V]
+words = [w for w in words if (len(set(w) & HARD_SOUNDS) == 0) or (len(set(w) & SOFT_SOUNDS) == 0)]
+
+def is_word_sharp(word):
+    return len(set(word) & SOFT_SOUNDS) == 0
+
+def is_word_round(word):
+    return len(set(word) & HARD_SOUNDS) == 0
+
+sharp_words = [w for w in words if is_word_sharp(w)]
+round_words = [w for w in words if is_word_round(w)]
+
+@torch.no_grad()
+def probe(word, model, tokenizer, template='a 3D rendering of a {} object', cats=['sharp', 'round']):
+    prompts = [template.format(f'{word} shaped')] + [
+        template.format(c)
+        for c in cats
+    ]
+    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
+    outputs = model(**inputs)
+    embs = outputs.text_embeds
+    embs /= embs.norm(dim=-1)[:, None]
+
+    v_mask = embs[0] # (512,)
+    v_prompts = embs[1:] # (k, 512)
+    scores = v_prompts @ v_mask # (k,)
+    scores = scores.cpu()
+    d = {
+        c: s.item()
+        for c, s in zip(cats, scores)
+    }
+    return np.mean(list(d.values()))