import os


def get_token_path():
    """Return the Hugging Face token, preferring the HF_TOKEN environment
    variable and falling back to the cached token file on disk. Despite the
    name, this returns the token value itself, not a path."""
    token_path = os.path.expanduser("~/.cache/huggingface/token")
    hf_token = os.getenv("HF_TOKEN")
    if hf_token is None and os.path.exists(token_path):
        with open(token_path, "r") as file:
            hf_token = file.readline().strip()
    return hf_token
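

# Usage sketch (illustrative; assumes the standard Hugging Face Inference API
# bearer-token scheme, with `model_id` and `payload` as placeholders):
#
#     headers = {"Authorization": f"Bearer {get_token_path()}"}
#     requests.post(Config.endpoint_api + model_id, headers=headers, json=payload)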


class Config:
    # Hugging Face dataset identifiers for the raw, formatted, and
    # logprob-annotated versions of the BiasShades prompts.
    prompts_dataset_raw = "LanguageShades/BiasShadesRaw"
    prompts_dataset_formatted = "LanguageShades/FormattedBiasShades"
    prompts_dataset_logprobs = "LanguageShades/FormattedBiasShadesWithLogprobs"
    # Base URL of the Hugging Face Inference API; a model ID is appended to it.
    endpoint_api = "https://api-inference.huggingface.co/models/"
    languages = [
        "Arabic",
        "Bengali",
        "Brazilian Portuguese",
        "Chinese",
        "Traditional Chinese",
        "Dutch",
        "English",
        "French",
        "German",
        "Hindi",
        "Italian",
        "Marathi",
        "Polish",
        "Romanian",
        "Russian",
        "Spanish",
    ]
    language_codes = {
        "Arabic": "ar",
        "Bengali": "bn",
        "Brazilian Portuguese": "pt-BR",
        "Dominican Republic Spanish": "es-DO",
        "Chinese": "zh",
        "Traditional Chinese": "zh-hant",
        "Dutch": "nl",
        "English": "en",
        "French": "fr",
        "German": "de",
        "Hindi": "hi",
        "Italian": "it",
        "Marathi": "mr",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Uzbekistan Russian": "ru-UZ",
        "Spanish": "es",
    }
    # Languages that are also represented in the stereotypes.
    language_code_list = [
        "ar", "bn", "pt-BR", "zh", "zh-hant", "nl", "en", "fr", "de",
        "hi", "it", "mr", "pl", "ro", "ru", "es-DO", "es", "ru-UZ",
    ]
    country_iso_map = {
        "Algeria": "DZA",
        "Bahrain": "BHR",
        "Egypt": "EGY",
        "Iraq": "IRQ",
        "Jordan": "JOR",
        "Kuwait": "KWT",
        "Libya": "LBY",
        "Mauritania": "MRT",
        "Morocco": "MAR",
        "Oman": "OMN",
        "Palestine": "PSE",
        "Qatar": "QAT",
        "Saudi Arabia": "SAU",
        "Sudan": "SDN",
        "Syria": "SYR",
        "Tunisia": "TUN",
        "United Arab Emirates": "ARE",
        "Yemen": "YEM",
        "Mainland China": "CHN",
        "India": "IND",
        "Brazil": "BRA",
        "Uzbekistan": "UZB",
        "Dominican Republic": "DOM",
        "Romania": "ROU",
        "Russia": "RUS",
        "Hong Kong": "HKG",
        "France": "FRA",
        "Netherlands": "NLD",
        "Flemish Belgium": "BEL",  # Assuming it's referring to Belgium
        "Flanders Belgium": "BEL",  # Assuming it's referring to Belgium
        "Poland": "POL",
        "Italy": "ITA",
        "Japan": "JPN",
        "West Germany": "DEU",  # West Germany is now part of Germany (DEU)
        "China": "CHN",
        "Germany": "DEU",
        "mainland China": "CHN",
        "Lebanon": "LBN",
        "US": "USA",
        "UK": "GBR",
    }
    basic_cols = ['index', 'subset', 'bias_type', 'stereotype_origin_langs',
                  'stereotype_valid_langs', 'stereotype_valid_regions',
                  'stereotyped_entity', 'type']
    formatted_dataset_columns = [
        'index', 'subset', 'bias_type', 'stereotype_origin_langs',
        'stereotype_valid_langs', 'stereotype_valid_regions',
        'stereotyped_entity', 'type',
        'en_templates', 'en_biased_sentences', 'en_expression', 'en_comments',
        'fr_templates', 'fr_biased_sentences', 'fr_expression', 'fr_comments',
        'ro_templates', 'ro_biased_sentences', 'ro_expression', 'ro_comments',
        'ar_templates', 'ar_biased_sentences', 'ar_comments', 'ar_expression',
        'bn_templates', 'bn_biased_sentences', 'bn_comments', 'bn_expression',
        'zh_templates', 'zh_biased_sentences', 'zh_expression', 'zh_comments',
        'zh_hant_templates', 'zh_hant_biased_sentences', 'zh_hant_expression', 'zh_hant_comments',
        'nl_templates', 'nl_biased_sentences', 'nl_expression', 'nl_comments',
        'hi_templates', 'hi_biased_sentences', 'hi_expression', 'hi_comments',
        'mr_templates', 'mr_biased_sentences', 'mr_expression', 'mr_comments',
        'ru_templates', 'ru_biased_sentences', 'ru_comments', 'ru_expression',
        'de_templates', 'de_biased_sentences', 'de_comments', 'de_expression',
        'it_templates', 'it_biased_sentences', 'it_expression', 'it_comments',
        'pl_templates', 'pl_biased_sentences', 'pl_comments', 'pl_expression',
        'pt_br_templates', 'pt_br_biased_sentences', 'pt_br_comments', 'pt_br_expression',
        'es_templates', 'es_biased_sentences', 'es_comments', 'es_expression',
    ]
    all_types = ["obligation", "declaration", "aspiration", "conversational", "description", "question"]
    # Resolved once at import time: HF_TOKEN takes precedence, then the cached
    # token file (see get_token_path above).
    hf_token = get_token_path()
    base_model_list = [
        "Qwen/Qwen2-1.5B",
        "Qwen/Qwen2-7B",
        "Qwen/Qwen2-72B",
        "meta-llama/Meta-Llama-3-8B",
        "meta-llama/Meta-Llama-3-70B",
        "bigscience/bloom-7b1",
        "bigscience/bloom-1b7",
        # "bigscience/bloom",  # This one is way too big to use, heh.
        "mistralai/Mistral-7B-v0.1",
    ]
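

if __name__ == "__main__":
    # Minimal sanity-check sketch: print a few of the values defined above so
    # the module can be eyeballed directly with `python config.py`. Only
    # attributes defined in this file are referenced.
    print("HF token resolved:", Config.hf_token is not None)
    print("Example endpoint:", Config.endpoint_api + Config.base_model_list[0])
    print("Languages with codes:", len(Config.language_codes))
    print("Formatted dataset columns:", len(Config.formatted_dataset_columns))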