Skip to content

Commit

Permalink
code
Browse files Browse the repository at this point in the history
  • Loading branch information
morrisalp committed Nov 27, 2023
1 parent 03e3267 commit bcfca95
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 0 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@

This is the official repository for the paper: *Morris Alper and Hadar Averbuch-Elor (2023). Kiki or Bouba? Sound Symbolism in Vision-and-Language Models. NeurIPS 2023*

## Code

All code has been tested with Python 3.9. After installing the required libraries (`pip install -r requirements.txt`), you may reproduce our results as follows:

* CLIP:
  * `python src/clip_tests.py`
* Stable Diffusion (SD):
  * `python src/sd_generate_images.py`
  * `python src/sd_tests.py`

Note that the SD scripts must be run in order, as the first generates images which are used by the second.

## License

We release our code under the [MIT license](https://opensource.org/license/mit/).
Expand Down
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
numpy>=1.23.5
pandas>=2.1.1
scikit-learn>=1.0.2
torch>=2.0.0
tqdm>=4.65.0
transformers>=4.30.2
Binary file added src/__pycache__/config.cpython-38.pyc
Binary file not shown.
77 changes: 77 additions & 0 deletions src/clip_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from transformers import AutoTokenizer, CLIPTextModelWithProjection
from transformers import logging as transformers_logging
from utils import sharp_cats, round_cats, words, probe, is_word_round, is_word_sharp, sharp_words, round_words
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import kendalltau

def main():
    """Run the CLIP geometric and phonetic scoring tests and print the metrics.

    Geometric scoring: every pseudoword in ``words`` is probed against the
    sharp and the round adjective lists; ``delta = r - s`` is compared with
    the word's phonetic class via ROC-AUC and Kendall's tau, and ``dPkb``
    reports the difference between the fractions of pseudowords whose delta
    lies below that of 'bouba' and below that of 'kiki'.

    Phonetic scoring: every adjective is probed against the round and the
    sharp pseudoword lists, and the difference is compared with the
    adjective's class (0 = sharp, 1 = round).

    Results are printed to stdout; nothing is returned.
    """
    # Imported locally: torch is not among this module's top-level imports.
    import torch

    transformers_logging.set_verbosity_error()

    MODEL_ID = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
    # Prefer the GPU, but fall back to CPU instead of crashing on CUDA-less hosts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print("Loading model...")
    model = CLIPTextModelWithProjection.from_pretrained(MODEL_ID).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    print("Model loaded")

    ### Geometric Scoring ###

    def probe_s(word):
        return probe(word, model, tokenizer, cats=sharp_cats)

    def probe_r(word):
        return probe(word, model, tokenizer, cats=round_cats)

    scores = {}
    for word in tqdm(words, desc="Calculating geometric scores"):
        scores[word] = (probe_s(word), probe_r(word))

    pdf = pd.DataFrame({'word': list(scores)})  # pseudoword df
    pdf['is_sharp'] = pdf.word.apply(is_word_sharp)
    pdf['is_round'] = pdf.word.apply(is_word_round)
    # Each pseudoword must fall in exactly one of the two classes.
    assert (pdf.is_sharp ^ pdf.is_round).all(), "Some word is neither sharp nor round."

    pairs = pdf.word.map(scores)
    pdf['s'] = pairs.apply(lambda x: x[0])
    pdf['r'] = pairs.apply(lambda x: x[1])
    # Positive delta: the word sits closer to the round adjectives.
    pdf['delta'] = pdf.r - pdf.s

    kd = probe_r('kiki') - probe_s('kiki')
    bd = probe_r('bouba') - probe_s('bouba')
    # Difference between the fractions of pseudowords scoring below bouba / kiki.
    dPkb = (pdf.delta < bd).mean() - (pdf.delta < kd).mean()

    auc = roc_auc_score(pdf.is_round, pdf.delta)
    tau = kendalltau(pdf.is_round, pdf.delta).statistic

    print('Geometric scoring metrics:')
    print(f'\tAUC:\t{auc:.2f}')
    print(f'\tTau:\t{tau:.2f}')
    print(f'\tdPkb:\t{dPkb:.2f}')

    ### Phonetic Scoring ###

    def score_adj(adj):
        # NOTE(review): probe() appends ' shaped' to its first argument, so
        # with this template the adjective prompt reads
        # 'a 3D rendering of a <adj> shaped shaped object'. Left unchanged so
        # the printed metrics stay comparable; confirm whether the doubled
        # word is intended.
        template = 'a 3D rendering of a {} shaped object'
        return (
            probe(adj, model, tokenizer, template=template, cats=round_words)
            - probe(adj, model, tokenizer, template=template, cats=sharp_words)
        )

    adf = pd.DataFrame({  # adjective df
        'adj': sharp_cats + round_cats,
        'c': [0] * len(sharp_cats) + [1] * len(round_cats),
    })
    tqdm.pandas(desc="Calculating phonetic scores")
    adf['score'] = adf.adj.progress_apply(score_adj)

    auc_phon = roc_auc_score(adf.c, adf.score)
    tau_phon = kendalltau(adf.c, adf.score).statistic

    print('Phonetic scoring metrics:')
    print(f'\tAUC:\t{auc_phon:.2f}')
    print(f'\tTau:\t{tau_phon:.2f}')


if __name__ == "__main__":
    main()
Empty file added src/sd_generate_images.py
Empty file.
Empty file added src/sd_tests.py
Empty file.
42 changes: 42 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import torch
import numpy as np

# Adjective lists used as the "sharp" and "round" categories.
sharp_cats = [
    'sharp', 'spiky', 'angular', 'jagged', 'hard',
    'edgy', 'pointed', 'prickly', 'rugged', 'uneven',
]
round_cats = [
    'round', 'circular', 'soft', 'fat', 'chubby',
    'curved', 'smooth', 'plush', 'plump', 'rotund',
]

# Consonant and vowel inventories from which pseudowords are built.
C = 'bdgktpslhmnx'
V = 'aeiou'
# Letters counted as hard / soft; 'a' appears in neither set.
HARD_SOUNDS = set('ptkshixe')
SOFT_SOUNDS = set('bdglumno')

# Pseudowords have the shape c1 v1 c2 v2 c1 v1. Keep only those that avoid
# every hard letter or avoid every soft letter.
words = [
    w
    for w in (
        c1 + v1 + c2 + v2 + c1 + v1
        for c1 in C for c2 in C for v1 in V for v2 in V
    )
    if HARD_SOUNDS.isdisjoint(w) or SOFT_SOUNDS.isdisjoint(w)
]

def is_word_sharp(word):
    """Return True when ``word`` contains none of the ``SOFT_SOUNDS`` letters."""
    return SOFT_SOUNDS.isdisjoint(word)

def is_word_round(word):
    """Return True when ``word`` contains none of the ``HARD_SOUNDS`` letters."""
    return HARD_SOUNDS.isdisjoint(word)

# Split the pseudoword list by phonetic class, preserving the order of `words`.
sharp_words = list(filter(is_word_sharp, words))
round_words = list(filter(is_word_round, words))

@torch.no_grad()
def probe(word, model, tokenizer, template='a 3D rendering of a {} object', cats=('sharp', 'round')):
    """Return the mean similarity between a word prompt and category prompts.

    The word prompt is ``template.format(f'{word} shaped')`` and each category
    prompt is ``template.format(c)``. All prompts are embedded in one batch,
    every embedding is scaled to unit L2 norm, and the dot product of each
    category embedding with the word embedding is averaged.

    Args:
        word: Text inserted into the template; ' shaped' is appended to it.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``text_embeds`` tensor. Must
            expose a ``device`` attribute.
        tokenizer: Callable turning the list of prompts into a batch that
            supports ``.to(device)`` and ``**`` unpacking.
        template: Format string with a single ``{}`` placeholder.
        cats: Iterable of category strings.

    Returns:
        The NumPy mean of the per-category similarities. Similarities are
        keyed by category string, so a repeated category counts once.
    """
    # Materialise once: the categories are walked twice below, which would
    # silently exhaust a one-shot iterable.
    cats = list(cats)

    prompts = [template.format(f'{word} shaped')]
    prompts += [template.format(c) for c in cats]

    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)
    embs = model(**inputs).text_embeds  # one row per prompt
    # Normalise out of place so the tensor owned by the model output is not
    # modified behind the caller's back.
    embs = embs / embs.norm(dim=-1, keepdim=True)

    v_word = embs[0]   # embedding of the word prompt
    v_cats = embs[1:]  # one row per category prompt
    sims = (v_cats @ v_word).cpu().tolist()

    per_cat = dict(zip(cats, sims))
    return np.mean(list(per_cat.values()))

0 comments on commit bcfca95

Please sign in to comment.