constants.py

#!/usr/bin/env python3
# Copyright (c) ACL 2024, Natural Language Reasoning and Structured Explanations Workshop

from dataclasses import dataclass
from enum import Enum
from typing import List, Optional


class UseRef(Enum):
    """Whether scoring compares hypotheses against gold references or source contexts."""
    YES = "with_ref"
    NO = "no_ref"


@dataclass
class Example:
    """One evaluation example: source context, hypothesis to be scored, optional reference."""
    context: str
    hypo: str
    ref: Optional[str] = None


@dataclass
class ScoreMe:
    """A batch of hypotheses paired with either references or source contexts."""
    hypos: List[str]
    context_ref: List[str]
    use_ref: UseRef

    def __init__(self, exs: List[Example], use_ref: UseRef):
        self.use_ref = use_ref
        if self.use_ref is UseRef.YES:
            # Reference-based scoring: pair each hypothesis with its reference.
            self.context_ref = [x.ref for x in exs]
        else:
            # Reference-free scoring: fall back to the source context.
            self.context_ref = [x.context for x in exs]
        assert len(self.context_ref) > 0 and self.context_ref[0] is not None
        self.hypos = [x.hypo for x in exs]
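
# A minimal usage sketch (illustrative only; the example values are made up):
#
#     exs = [
#         Example(context="Q: 2 + 2?", hypo="The answer is 4.", ref="4"),
#         Example(context="Q: 3 * 3?", hypo="The answer is 9.", ref="9"),
#     ]
#     ScoreMe(exs, UseRef.YES).context_ref  # -> ["4", "9"]
#     ScoreMe(exs, UseRef.NO).context_ref   # -> ["Q: 2 + 2?", "Q: 3 * 3?"]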


# Generation / API settings
MAX_RETRY = 3
API_TIME_INTERVAL = 2.0
TEMP_FOR_MULTI_CHAINS = 0.7
N_FOR_MULTI_CHAINS = 10
MAX_TOKEN_FOR_SELFVERIFY = 180
STR_GEN_STOP = "\n\n"  # alternative stop sequence: "\n"
DICT_STR_SPLIT_RATIONALE = {"\n\n": "\n", "\n": ". "}  # fallback split delimiters; see sketch below
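
# One plausible use of DICT_STR_SPLIT_RATIONALE (an assumption, not confirmed by
# this module): split a rationale on the primary delimiter, falling back to the
# mapped delimiter whenever the current one yields a single chunk:
#
#     def split_rationale(text: str, sep: str = "\n\n") -> List[str]:
#         parts = [p for p in text.split(sep) if p.strip()]
#         if len(parts) > 1 or sep not in DICT_STR_SPLIT_RATIONALE:
#             return parts
#         return split_rationale(text, DICT_STR_SPLIT_RATIONALE[sep])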
STR_TRIGGER_RESTORE = "restore"

# Prompt triggers
COT_TRIGGER = "Let's think step by step."
DIRECT_TRIGGER = "Please provide a direct answer without additional explanation."
PROMPT_NAME_ZSL = "NoNeed"

# Thresholds
N = 10
M = 3
THRESHOLD_1 = 1.0 / (1 + M * (N - 2) / N)
THRESHOLD_2 = 1.0 / (1 + 1 / N)
M_CSQA = 10
THRESHOLD_1_CSQA = 1.0 / (1 + M_CSQA * (N - 2) / N)
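# Worked values, for reference:
#   THRESHOLD_1      = 1 / (1 + 3 * 8 / 10)  = 1 / 3.4 ≈ 0.294
#   THRESHOLD_2      = 1 / (1 + 1 / 10)      = 1 / 1.1 ≈ 0.909
#   THRESHOLD_1_CSQA = 1 / (1 + 10 * 8 / 10) = 1 / 9   ≈ 0.111
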
# SCORES
ROUGE_1 = "rouge_1"
ROUGE_2 = "rouge_2"
ROUGE_L = "rouge_l"
BLEURT = "bleurt"
BERTSCORE_F = "bertScore_f"
BARTSCORE_F = "bartScore_f"
BARTSCORE_CNN_F = "bartScore_cnn_f"
BARTSCORE_CNN_PARA_F = "bartscore_cnn_para_f"
BARTSCORE_FINETUNED_F = "bartscore_finetuned_f"
# note: PRISM changes its underlying behavior depending on whether references are provided
PRISM_AVG = "prism_avg"
CTC_RELEVANCE_SUMMARY = "ctc_relevance_summary"
CTC_CONSISTENCY_SUMMARY = "ctc_consistency_summary"

BASELINE_SCORES = [  # comment out any metric we no longer want to compute
    ROUGE_1,
    ROUGE_2,
    ROUGE_L,
    BLEURT,
    BERTSCORE_F,
    BARTSCORE_F,
    # BARTSCORE_CNN_F,
    BARTSCORE_CNN_PARA_F,
    BARTSCORE_FINETUNED_F,
    PRISM_AVG,
    CTC_RELEVANCE_SUMMARY,
    CTC_CONSISTENCY_SUMMARY,
]

# Paths
DEFAULT_INPUT_ANN_PATH = "./data/annotated/"
DEFAULT_INPUT_GEN_PATH = "./data/generated/"
DEFAULT_INPUT_RES_PATH = "./data/restored/"
DEFAULT_PROMPT_PATH = "./prompts/"
DEFAULT_OUTPUT_PATH = "./scores/"
DEFAULT_LOG_PATH = "./log/"
DEFAULT_INPUT_PATH = "./data/restored"  # no trailing slash: used as f"{DEFAULT_INPUT_PATH}/<file>"

# Datasets
INPUT_DATA_FILES_HUMAN = {
    # "drop": f"{DEFAULT_INPUT_PATH}/drop.json",
    # "esnli": f"{DEFAULT_INPUT_PATH}/esnli.json",
    # "cosmos": f"{DEFAULT_INPUT_PATH}/cosmos.json",
    # "semeval": f"{DEFAULT_INPUT_PATH}/semevalcommonsense.json",
    "gsm8k": f"{DEFAULT_INPUT_PATH}/gsm8k.jsonl",
    "svamp": f"{DEFAULT_INPUT_PATH}/svamp.jsonl",
    "multiarith": f"{DEFAULT_INPUT_PATH}/multiarith.jsonl",
    "mathqa": f"{DEFAULT_INPUT_PATH}/mathqa.jsonl",
    "csqa": f"{DEFAULT_INPUT_PATH}/csqa.jsonl",
    # "strategyqa": f"{DEFAULT_INPUT_PATH}/strategyqa.jsonl",
}
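
# Illustrative only: the .jsonl files are assumed to hold one JSON object per
# line; the record schema is not defined in this module.
#
#     import json
#     with open(INPUT_DATA_FILES_HUMAN["gsm8k"]) as f:
#         records = [json.loads(line) for line in f]
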
INPUT_DATA_HUMAN = list(INPUT_DATA_FILES_HUMAN.keys())

INPUT_DATA_SYNTHETIC = [
    "aqua",
    "asdiv",
    "entailment_bank",
    "eqasc",
    "math",
    "proofwriter",
    # "strategy_qa",  # used for train + valid only
]

# DATASETS = INPUT_DATA_HUMAN + INPUT_DATA_SYNTHETIC
DATASETS = INPUT_DATA_HUMAN

# Perturbations
PERTURBATIONS = [
    "ShuffleSteps",
    "DuplicateOneStep",
    "RemoveOneStep",
    "SwapOneStep",
    "ExtrinsicHallucinatedStep",
    "ParaphraseSteps",
    "GrammaticalErrorStep",
    "NegateStep",
    "SemanticChangeStep",
    "ShuffleNumbers",
    "ShuffleOperations",
    "RandomNumber",
    "RandomOperation",
]