diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Arabic.py
new file mode 100755
index 00000000..a115fa49
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Arabic.py
@@ -0,0 +1,87 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. أجب فقط بـ 'صحيحة' أو 'خاطئة'.\n\n"
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في التحقق من الحقائق.",
+ },
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
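+    # Assemble the few-shot prompt: the instruction, one sentence/label pair per example, then the unlabeled target sentence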
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ label = "صحيحة" if example["label"] == "true" else "خاطئة"
+
+ out_prompt = (
+ out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+ )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
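+    # Match label keywords in English or Arabic; true-indicators are checked before false-indicators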
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "right" in input_label
+ or "صحيح" in input_label
+ or "صحيحة" in input_label
+ or "نعم" in input_label
+ ):
+ return "true"
+
+ elif (
+ "false" in input_label
+ or "wrong" in input_label
+ or "خطأ" in input_label
+ or "لا" in input_label
+ or "not" in input_label
+ or "خاطئة" in input_label
+ or "خاطئ" in input_label
+ ):
+ return "false"
+
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_English.py
new file mode 100755
index 00000000..2253ecfb
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_English.py
@@ -0,0 +1,87 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "Detect whether the information in the sentence is factually true or false. Answer only by true or false.\n\n"
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "system",
+ "content": "You are an expert fact checker.",
+ },
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ label = example["label"]
+
+ out_prompt = (
+ out_prompt + "sentence: " + sent + "\n" + "label: " + label + "\n\n"
+ )
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt = out_prompt + "sentence: " + input_sample + "\nlabel: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "right" in input_label
+ or "صحيح" in input_label
+ or "صحيحة" in input_label
+ or "نعم" in input_label
+ ):
+ return "true"
+
+ elif (
+ "false" in input_label
+ or "wrong" in input_label
+ or "خطأ" in input_label
+ or "لا" in input_label
+ or "not" in input_label
+ or "خاطئة" in input_label
+ or "خاطئ" in input_label
+ ):
+ return "false"
+
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Mixed.py
new file mode 100755
index 00000000..1655c2a1
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Mixed.py
@@ -0,0 +1,87 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. أجب فقط بـ 'true' أو 'false'.\n\n"
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في التحقق من الحقائق.",
+ },
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ label = example["label"]
+
+ out_prompt = (
+ out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+ )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "right" in input_label
+ or "صحيح" in input_label
+ or "صحيحة" in input_label
+ or "نعم" in input_label
+ ):
+ return "true"
+
+ elif (
+ "false" in input_label
+ or "wrong" in input_label
+ or "خطأ" in input_label
+ or "لا" in input_label
+ or "not" in input_label
+ or "خاطئة" in input_label
+ or "خاطئ" in input_label
+ ):
+ return "false"
+
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Arabic.py
new file mode 100755
index 00000000..ff1fe4ec
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Arabic.py
@@ -0,0 +1,73 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. "
+ "أجب فقط بـ 'صحيحة' أو 'خاطئة'.\n\n"
+ + "الجملة: "
+ + input_sample
+ + "\nالتصنيف: \n"
+ )
+
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في التحقق من الحقائق.",
+ },
+ {
+ "role": "user",
+ "content": prompt_text,
+ },
+ ]
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "right" in input_label
+ or "صحيح" in input_label
+ or "صحيحة" in input_label
+ or "نعم" in input_label
+ ):
+ return "true"
+
+ elif (
+ "false" in input_label
+ or "wrong" in input_label
+ or "خطأ" in input_label
+ or "لا" in input_label
+ or "not" in input_label
+ or "خاطئة" in input_label
+ or "خاطئ" in input_label
+ ):
+ return "false"
+
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_English.py
new file mode 100755
index 00000000..fc513431
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_English.py
@@ -0,0 +1,72 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "Detect whether the information in the sentence is factually true or false. Answer only by true or false.\n\n"
+ + "sentence: "
+ + input_sample
+ + "\nlabel: \n"
+ )
+
+ return [
+ {
+ "role": "system",
+ "content": "You are an expert fact checker.",
+ },
+ {
+ "role": "user",
+ "content": prompt_text,
+ },
+ ]
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "right" in input_label
+ or "صحيح" in input_label
+ or "صحيحة" in input_label
+ or "نعم" in input_label
+ ):
+ return "true"
+
+ elif (
+ "false" in input_label
+ or "wrong" in input_label
+ or "خطأ" in input_label
+ or "لا" in input_label
+ or "not" in input_label
+ or "خاطئة" in input_label
+ or "خاطئ" in input_label
+ ):
+ return "false"
+
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Mixed.py
new file mode 100755
index 00000000..6140335a
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Mixed.py
@@ -0,0 +1,73 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. "
+ "أجب فقط بـ 'true' أو 'false'.\n\n"
+ + "الجملة: "
+ + input_sample
+ + "\nالتصنيف: \n"
+ )
+
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في التحقق من الحقائق.",
+ },
+ {
+ "role": "user",
+ "content": prompt_text,
+ },
+ ]
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "right" in input_label
+ or "صحيح" in input_label
+ or "صحيحة" in input_label
+ or "نعم" in input_label
+ ):
+ return "true"
+
+ elif (
+ "false" in input_label
+ or "wrong" in input_label
+ or "خطأ" in input_label
+ or "لا" in input_label
+ or "not" in input_label
+ or "خاطئة" in input_label
+ or "خاطئ" in input_label
+ ):
+ return "false"
+
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Arabic.py
new file mode 100755
index 00000000..4d3b37ee
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Arabic.py
@@ -0,0 +1,89 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": FastChatModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة. "
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ if example["label"] == "true":
+ label = "نعم"
+ elif example["label"] == "false":
+ label = "لا"
+ out_prompt = (
+ out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+ )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace(".", "").strip().lower()
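+    # JAIS occasionally refuses to answer; "آسف" ("sorry") and "لا أستطيع" ("I cannot") signal a refusal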
+ if "آسف" in label or "لا أستطيع" in label:
+ return None
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ or "غير صحيح" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ or "صحيح" in label
+ or "صح" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_English.py
new file mode 100755
index 00000000..c418520c
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_English.py
@@ -0,0 +1,83 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": FastChatModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "Detect whether the information in the sentence is factually true or false. Answer only by true or false.\n\n"
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ label = example["label"]
+
+ out_prompt = (
+ out_prompt + "Sentence: " + sent + "\n" + "label: " + label + "\n\n"
+ )
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt = out_prompt + "Sentence: " + input_sample + "\nlabel: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "yes" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Mixed.py
new file mode 100755
index 00000000..23cdda1e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Mixed.py
@@ -0,0 +1,84 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": FastChatModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. "
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ if example["label"] == "true":
+ label = "true"
+ elif example["label"] == "false":
+ label = "false"
+ out_prompt = (
+ out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+ )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..b002cbee
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Arabic.py
@@ -0,0 +1,67 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": FastChatModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة. "
+ + "الجملة: "
+ + input_sample
+ + "\التصنيف: \n"
+ )
+
+ return [
+ {
+ "role": "user",
+ "content": prompt_text,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_English.py
new file mode 100755
index 00000000..986f1274
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_English.py
@@ -0,0 +1,64 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": FastChatModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample):
+ base_prompt = (
+ "Detect whether the information in the sentence is factually true or false. "
+ "Answer only by true or false.\n\n"
+ + "Sentence: "
+ + input_sample
+ + "\nlabel: \n"
+ )
+
+ return [
+ {
+ "role": "user",
+ "content": base_prompt,
+ },
+ ]
+
+
+def post_process(response):
+ input_label = response["choices"][0]["message"]["content"]
+ input_label = input_label.replace(".", "").strip().lower()
+
+ if (
+ "true" in input_label
+ or "yes" in input_label
+ or "label: 1" in input_label
+ or "label: yes" in input_label
+ ):
+ pred_label = "true"
+ elif (
+ "false" in input_label
+ or "label: 0" in input_label
+ or "label: no" in input_label
+ ):
+ pred_label = "false"
+ else:
+ print("label problem!! " + input_label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..469ee3eb
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Mixed.py
@@ -0,0 +1,67 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": FastChatModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. "
+ + "الجملة: "
+ + input_sample
+ + "\nالتصنيف: \n"
+ )
+
+ return [
+ {
+ "role": "user",
+ "content": prompt_text,
+ }
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Arabic.py
new file mode 100755
index 00000000..e72f2575
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Arabic.py
@@ -0,0 +1,92 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": AzureModel,
+ "model_args": {
+ "max_tries": 100,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة."
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ if example["label"] == "true":
+ label = "نعم"
+ elif example["label"] == "false":
+ label = "لا"
+ out_prompt = (
+ out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+ )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
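+    # AzureModel returns the generated text under "output"; anything else is logged and treated as empty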
+ if "output" in response:
+ # if "content" in response["messages"]:
+ label = response["output"].strip()
+ label = label.replace("", "")
+ label = label.replace("", "")
+ else:
+ print("Response .. " + str(response))
+ label = ""
+
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_English.py
new file mode 100755
index 00000000..30bced14
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_English.py
@@ -0,0 +1,77 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": AzureModel,
+ "model_args": {
+ "max_tries": 100,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "Detect whether the information in the sentence is factually true or false. Answer only by true or false.\n\n"
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ label = example["label"]
+
+ out_prompt = (
+ out_prompt + "Sentence: " + sent + "\n" + "label: " + label + "\n\n"
+ )
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt = out_prompt + "Sentence: " + input_sample + "\nlabel: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ if "output" in response:
+ # if "content" in response["messages"]:
+ label = response["output"].strip()
+ label = label.replace("", "")
+ label = label.replace("", "")
+ else:
+ print("Response .. " + str(response))
+ label = ""
+
+ label = label.replace(".", "").strip().lower()
+
+ if "true" in label or "label: 1" in label or "label: yes" in label:
+ pred_label = "true"
+ elif "false" in label or "label: 0" in label or "label: no" in label:
+ pred_label = "false"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Mixed.py
new file mode 100755
index 00000000..18a92734
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Mixed.py
@@ -0,0 +1,92 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": AzureModel,
+ "model_args": {
+ "max_tries": 100,
+ },
+ }
+
+
+def prompt(input_sample, examples):
+ prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. "
+
+ fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+ return [
+ {
+ "role": "user",
+ "content": fs_prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt
+ for example in examples:
+ sent = example["input"]
+ if example["label"] == "true":
+ label = "true"
+ elif example["label"] == "false":
+ label = "false"
+ out_prompt = (
+ out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+ )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ if "output" in response:
+ # if "content" in response["messages"]:
+ label = response["output"].strip()
+ label = label.replace("", "")
+ label = label.replace("", "")
+ else:
+ print("Response .. " + str(response))
+ label = ""
+
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..2cd40497
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Arabic.py
@@ -0,0 +1,75 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": AzureModel,
+ "model_args": {
+ "max_tries": 100,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة. "
+ + "الجملة: "
+ + input_sample
+ + "\التصنيف: \n"
+ )
+
+ return [
+ {
+ "role": "user",
+ "content": prompt_text,
+ },
+ ]
+
+
+def post_process(response):
+ if "output" in response:
+ # if "content" in response["messages"]:
+ label = response["output"].strip()
+ label = label.replace("", "")
+ label = label.replace("", "")
+ else:
+ print("Response .. " + str(response))
+ label = ""
+
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "خطا" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "صح" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_English.py
new file mode 100755
index 00000000..574b5ce9
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_English.py
@@ -0,0 +1,62 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": AzureModel,
+ "model_args": {
+ "max_tries": 100,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "Detect whether the information in the sentence is factually true or false. "
+ "Answer only by 'true' or 'false'.\n\n"
+ + "Sentence: "
+ + input_sample
+ + "\nlabel: \n"
+ )
+
+ return [
+ {
+ "role": "user",
+ "content": prompt_text,
+ },
+ ]
+
+
+def post_process(response):
+ if "output" in response:
+ # if "content" in response["messages"]:
+ label = response["output"].strip()
+ label = label.replace("", "")
+ label = label.replace("", "")
+ else:
+ print("Response .. " + str(response))
+ label = ""
+
+ label = label.replace(".", "").strip().lower()
+ if "true" in label or "label: 1" in label or "label: yes" in label:
+ pred_label = "true"
+ elif "false" in label or "label: 0" in label or "label: no" in label:
+ pred_label = "false"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..3cd6c987
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Mixed.py
@@ -0,0 +1,75 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": ANSFactualityDataset,
+ "task": FactualityTask,
+ "model": AzureModel,
+ "model_args": {
+ "max_tries": 100,
+ },
+ }
+
+
+def prompt(input_sample):
+ prompt_text = (
+ "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. "
+ + "الجملة: "
+ + input_sample
+ + "\nالتصنيف: \n"
+ )
+
+ return [
+ {
+ "role": "user",
+ "content": prompt_text,
+ }
+ ]
+
+
+def post_process(response):
+ if "output" in response:
+ # if "content" in response["messages"]:
+ label = response["output"].strip()
+ label = label.replace("", "")
+ label = label.replace("", "")
+ else:
+ print("Response .. " + str(response))
+ label = ""
+
+ label = label.replace(".", "").strip().lower()
+
+ if (
+ "لا" in label
+ or "خطأ" in label
+ or "ليست" in label
+ or "false" in label
+ or "label: 0" in label
+ or "label: no" in label
+ ):
+ pred_label = "false"
+ elif (
+ "نعم" in label
+ or "صحيحة" in label
+ or "true" in label
+ or "label: 1" in label
+ or "label: yes" in label
+ ):
+ pred_label = "true"
+ else:
+ print("label problem!! " + label)
+ pred_label = None
+
+ return pred_label
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Arabic.py
new file mode 100755
index 00000000..187661c9
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Arabic.py
@@ -0,0 +1,85 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
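+        # evaluate on the Arabic test split; few-shot examples come from the Arabic train split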
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def prompt(input_sample, examples):
+ base_prompt = "هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'ضارة'. أجب بـ 'غير ضارة' إن لم تكن كذلك. قدم فقط التصنيف.\n"
+ prompt = few_shot_prompt(input_sample, base_prompt, examples)
+
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+        # We found ChatGPT gets confused when 0/1 are used as labels in the prompt
+        label = "غير ضارة" if example["label"] == "0" else "ضارة"
+        out_prompt = (
+            out_prompt + "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+        )
+
+    # Append the tweet we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label_fixed = None
+ label = label.replace("label:", "")
+ label = label.replace("التصنيف:", "")
+ label = label.strip().lower()
+
+ if (
+ "غير ضارة" in label
+ or "ليست ضارة" in label
+ or "غير ضاره" in label
+ or "غير" in label
+ or "not" in label
+ or "ليست" in label
+ or "لا" in label
+ or "not harmful" in label
+ or label.startswith("no")
+ or "safe" in label
+ or "not_harmful" in label
+ ):
+ label_fixed = "0"
+ elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label:
+ label_fixed = "1"
+
+ return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_English.py
new file mode 100755
index 00000000..4915b237
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_English.py
@@ -0,0 +1,84 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def prompt(input_sample, examples):
+ base_prompt = "Can this tweet cause harm to any person, organization or entity? Answer only by 'harmful' or 'not harmful'. Provide only label.\n"
+ prompt = few_shot_prompt(input_sample, base_prompt, examples)
+
+ return [
+ {
+ "role": "system",
+ "content": "You are an expert in tweet analysis and classification.",
+ },
+ {
+ "role": "user",
+ "content": prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+        # We found ChatGPT gets confused when 0/1 are used as labels in the prompt
+ label = "harmful" if example["label"] == "1" else "not harmful"
+ out_prompt = (
+ out_prompt + "tweet: " + example["input"] + "\nlabel: " + label + "\n\n"
+ )
+
+ # Append the sentence we want the model to predict for but leave the label blank
+ out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "")
+ label = label.replace("التصنيف:", "")
+ label = label.strip().lower()
+
+ if (
+ "غير ضارة" in label
+ or "ليست ضارة" in label
+ or "غير ضاره" in label
+ or "غير" in label
+ or "not" in label
+ or "ليست" in label
+ or "لا" in label
+ or "not harmful" in label
+ or label.startswith("no")
+ or "safe" in label
+ or "not_harmful" in label
+ ):
+ return "0"
+ elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label:
+ return "1"
+
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Mixed.py
new file mode 100755
index 00000000..ffe298b4
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Mixed.py
@@ -0,0 +1,85 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def prompt(input_sample, examples):
+ base_prompt = "هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'harmful'. أجب بـ 'not harmful' إن لم تكن كذلك. قدم فقط التصنيف.\n"
+ prompt = few_shot_prompt(input_sample, base_prompt, examples)
+
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt,
+ },
+ ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+        # We found ChatGPT gets confused when 0/1 are used as labels in the prompt
+        label = "not harmful" if example["label"] == "0" else "harmful"
+        out_prompt = (
+            out_prompt + "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+        )
+
+    # Append the tweet we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "")
+ label = label.replace("التصنيف:", "")
+ label = label.strip().lower()
+
+ if (
+ "غير ضارة" in label
+ or "ليست ضارة" in label
+ or "غير ضاره" in label
+ or "غير" in label
+ or "not" in label
+ or "ليست" in label
+ or "لا" in label
+ or "not harmful" in label
+ or label.startswith("no")
+ or "safe" in label
+ or "not_harmful" in label
+ ):
+ return "0"
+ elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label:
+ return "1"
+
+    return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Arabic.py
new file mode 100755
index 00000000..0f5e05b0
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Arabic.py
@@ -0,0 +1,71 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
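+    # Arabic instruction, roughly: "Could this tweet cause harm to a person or organization? If so, answer 'ضارة' (harmful); otherwise 'غير ضارة' (not harmful). Provide only the label."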
+ prompt_string = (
+ f"هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'ضارة'. أجب بـ 'غير ضارة' إن لم تكن كذلك. قدم فقط التصنيف.\n\n"
+ f"التغريدة: {input_sample}\n"
+ f"التصنيف: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "")
+ label = label.replace("التصنيف:", "")
+ label = label.strip().lower()
+
+ if (
+ "غير ضارة" in label
+ or "ليست ضارة" in label
+ or "غير ضاره" in label
+ or "غير" in label
+ or "not" in label
+ or "ليست" in label
+ or "لا" in label
+ or "not harmful" in label
+ or label.startswith("no")
+ or "safe" in label
+ or "not_harmful" in label
+ ):
+ return "0"
+ elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label:
+ return "1"
+
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_English.py
new file mode 100755
index 00000000..cac11ed7
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_English.py
@@ -0,0 +1,71 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+ f"Can this tweet cause harm to any person, organization or entity? Answer only by 'harmful' or 'not harmful'. Provide only label.\n\n"
+ f"tweet: {input_sample}\n"
+ f"label: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "You are an expert in tweet analysis and classification.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "")
+ label = label.replace("التصنيف:", "")
+ label = label.strip().lower()
+
+ # Negative cues first: "غير", "ليست", "لا" and "not" also cover the longer
+ # phrases "غير ضارة", "ليست ضارة", "غير ضاره", "not harmful" and "not_harmful"
+ if (
+ "غير" in label
+ or "ليست" in label
+ or "لا" in label
+ or "not" in label
+ or label.startswith("no")
+ or "safe" in label
+ ):
+ return "0"
+ elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label:
+ return "1"
+
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Mixed.py
new file mode 100755
index 00000000..13196db3
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Mixed.py
@@ -0,0 +1,72 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "GPT-4o-2024-05-22",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+ f"هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'harmful'. أجب بـ 'not harmful' إن لم تكن كذلك. قدم فقط التصنيف.\n\n"
+ f"التغريدة: {input_sample}\n"
+ f"التصنيف: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "")
+ label = label.replace("التصنيف:", "")
+ label = label.strip().lower()
+
+ # Negative cues first: "غير", "ليست", "لا" and "not" also cover the longer
+ # phrases "غير ضارة", "ليست ضارة", "غير ضاره", "not harmful" and "not_harmful"
+ if (
+ "غير" in label
+ or "ليست" in label
+ or "لا" in label
+ or "not" in label
+ or label.startswith("no")
+ or "safe" in label
+ ):
+ return "0"
+ elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label:
+ return "1"
+
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Arabic.py
new file mode 100755
index 00000000..9e9be164
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Arabic.py
@@ -0,0 +1,67 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+ # Translate labels to Arabic
+ label = "غير ضارة" if example["label"] == "0" else "ضارة"
+ out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط."
+ return [
+ {
+ "role": "user",
+ "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+ }
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"].lower()
+ print("label: ", label)
+ if "لا يمكنني" in label or "ai" in label:
+ return None
+ if (
+ "غير ضارة" in label
+ or "لا يحتوي على محتوى ضار" in label
+ or "غير" in label
+ or "ليست" in label
+ ):
+ return "0"
+
+ else:
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_English.py
new file mode 100755
index 00000000..c26b458a
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_English.py
@@ -0,0 +1,75 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+ # Use textual labels; the model gets confused when 0 and 1 appear in the prompt
+ label = "not_harmful" if example["label"] == "0" else "harmful"
+ out_prompt = (
+ out_prompt + "tweet: " + example["input"] + "\nlabel: " + label + "\n\n"
+ )
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = "Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label."
+ return [
+ {
+ "role": "user",
+ "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+ }
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"].lower()
+ print("label: ", label)
+ if "label: harmful" in label or "is harmful" in label:
+ return "1"
+ if "لا يمكنني" in label or "ai" in label or "لا يمكن" in label:
+ return None
+ if (
+ "غير ضارة" in label
+ or "لا يحتوي على محتوى ضار" in label
+ or "غير" in label
+ or "ليست" in label
+ or "not" in label
+ ):
+ return "0"
+
+ else:
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Mixed.py
new file mode 100755
index 00000000..9fe4d0e3
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Mixed.py
@@ -0,0 +1,89 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+ # Map the numeric labels to the English labels used in the mixed prompt
+ label = "not_harmful" if example["label"] == "0" else "harmful"
+ out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = (
+ "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+ )
+ return [
+ {
+ "role": "user",
+ "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+ }
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"].lower()
+ print("label: ", label)
+ if (
+ "لا يمكنني" in label
+ or "ai" in label
+ or "لا يمكن" in label
+ or "ما هي المعايير" in label
+ ):
+ return None
+ if "التغريدة الأخيرة هي 'harmful'" in label:
+ return "1"
+ if "التغريدة الثالثة مصنفة على أنها 'غير مؤذية'" in label:
+ return "0"
+ if (
+ "التغريدة الثالثة مصنفة على أنها 'مؤذية'" in label
+ or "غير آمن" in label
+ or "لأنها لا تحتوي على محتوى ضار" in label
+ ):
+ return "1"
+
+ if "label: harmful" in label or "is harmful" in label:
+ return "1"
+
+ if (
+ "غير ضارة" in label
+ or "لا يحتوي على محتوى ضار" in label
+ or "غير" in label
+ or "ليس" in label
+ or "no" in label
+ ):
+ return "0"
+
+ else:
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..42592efa
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Arabic.py
@@ -0,0 +1,58 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ return [
+ {
+ "role": "user",
+ "content": (
+ "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط.\n\n"
+ + "تغريدة: "
+ + input_sample
+ + "\التصنيف: "
+ ),
+ }
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"].lower()
+ print("label: ", label)
+ if "لا يمكنني" in label or "ai" in label:
+ return None
+ if (
+ "غير ضارة" in label
+ or "لا يحتوي على محتوى ضار" in label
+ or "غير" in label
+ or "ليست" in label
+ ):
+ return "0"
+
+ else:
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_English.py
new file mode 100755
index 00000000..273545b3
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_English.py
@@ -0,0 +1,59 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = (
+ f"Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label.\n\n"
+ f"tweet: {input_sample}\n"
+ f"label: \n"
+ )
+ return [
+ {
+ "role": "user",
+ "content": base_prompt,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"].lower()
+ print("label: ", label)
+ if "label: harmful" in label or "is harmful" in label:
+ return "1"
+ if "لا يمكنني" in label or "ai" in label or "لا يمكن" in label:
+ return None
+ if (
+ "غير ضارة" in label
+ or "لا يحتوي على محتوى ضار" in label
+ or "غير" in label
+ or "ليست" in label
+ or "not" in label
+ ):
+ return "0"
+
+ else:
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..8fd926a5
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Mixed.py
@@ -0,0 +1,59 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "jais-13b-chat",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ return [
+ {
+ "role": "user",
+ "content": (
+ "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+ + "تغريدة: "
+ + input_sample
+ + "التصنيف: "
+ ),
+ }
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"].lower()
+
+ if "غير" in label or "ليس" in label or "not" in label:
+ return "0"
+ elif (
+ "ضار" in label
+ or "مس" in label
+ or "مؤ" in label
+ or "harm" in label
+ or "مض" in label
+ ):
+ return "1"
+
+ else:
+ return None
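The two-letter stems in the harmful branch above act as shorthand: "مس", "مؤ" and "مض" are presumably meant to catch inflected forms such as مسيء (offensive), مؤذية (hurtful) and مضرة (harmful) without enumerating every variant, at the cost of occasionally matching unrelated words. A small sketch of the same cascade on invented strings:

    for text in ["هذه التغريدة مؤذية", "محتوى مضر", "تغريدة ليست سيئة"]:
        # Mirrors post_process: negative cues first, then the harmful stems.
        if "غير" in text or "ليس" in text or "not" in text:
            print(text, "-> 0")
        elif any(stem in text for stem in ("ضار", "مس", "مؤ", "harm", "مض")):
            print(text, "-> 1")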
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Arabic.py
new file mode 100755
index 00000000..26d6281c
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Arabic.py
@@ -0,0 +1,66 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+ # Translate labels to Arabic
+ label = "غير ضارة" if example["label"] == "0" else "ضارة"
+ out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط."
+ return [
+ {
+ "role": "user",
+ "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+ }
+ ]
+
+
+def post_process(response):
+ # Extract the label from the response; a missing "output" key means no prediction
+ if "output" not in response:
+ return None
+ label = response["output"].strip().lower()
+
+ print("label: ", label)
+
+ if "غير ضارة" in label or "لا" in label:
+ return "0"
+ elif "ضارة" in label or "نعم" in label:
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_English.py
new file mode 100755
index 00000000..2495bdab
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_English.py
@@ -0,0 +1,71 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+ # Use textual labels; the model gets confused when 0 and 1 appear in the prompt
+ label = "not_harmful" if example["label"] == "0" else "harmful"
+ out_prompt = (
+ out_prompt + "tweet: " + example["input"] + "\nlabel: " + label + "\n\n"
+ )
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+ # print("=========== FS Prompt =============\n")
+ # print(out_prompt)
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = "Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label."
+ return [
+ {
+ "role": "user",
+ "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+ }
+ ]
+
+
+def post_process(response):
+ # Extract the label from the response; a missing "output" key means no prediction
+ if "output" not in response:
+ return None
+ label = response["output"].strip().lower()
+
+ print("label: ", label)
+
+ if "not_harmful" in label:
+ return "0"
+ elif label == "harmful":
+ return "1"
+ else:
+ return None
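Unlike the OpenAI and FastChat assets, which read response["choices"][0]["message"]["content"], the AzureModel assets read a flat "output" field, and the guard above maps a missing key (for example, a filtered or failed request) to None. A minimal sketch with hand-built responses:

    assert post_process({"output": "not_harmful"}) == "0"
    assert post_process({"output": "harmful"}) == "1"
    assert post_process({"error": "content filtered"}) is None  # no "output" key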
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Mixed.py
new file mode 100755
index 00000000..cb1587b5
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Mixed.py
@@ -0,0 +1,66 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 3,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ for example in examples:
+ # Map the numeric labels to the English labels used in the mixed prompt
+ label = "not_harmful" if example["label"] == "0" else "harmful"
+ out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+ # Append the sentence we want the model to predict for but leave the Label blank
+ out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = (
+ "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+ )
+ return [
+ {
+ "role": "user",
+ "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+ }
+ ]
+
+
+def post_process(response):
+ # Extract the label from the response; a missing "output" key means no prediction
+ if "output" not in response:
+ return None
+ label = response["output"].strip().lower()
+
+ print("label: ", label)
+ # "I cannot provide this type of answer." -- treat refusals as no prediction
+ if "لا أستطيع تقديم هذا النوع من الإجابات." in label:
+ return None
+ elif "غير" in label or "ليس" in label or "not" in label:
+ return "0"
+ else:
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..90d4fd20
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Arabic.py
@@ -0,0 +1,57 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ return [
+ {
+ "role": "user",
+ "content": (
+ "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط.\n\n"
+ + "تغريدة: "
+ + input_sample
+ + "\التصنيف: "
+ ),
+ }
+ ]
+
+
+def post_process(response):
+ # Extract the label from the response; a missing "output" key means no prediction
+ if "output" not in response:
+ return None
+ label = response["output"].strip().lower()
+
+ print("label: ", label)
+
+ if "غير ضارة" in label or "لا" in label:
+ return "0"
+ elif "ضارة" in label or "نعم" in label:
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_English.py
new file mode 100755
index 00000000..e2d8107d
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_English.py
@@ -0,0 +1,57 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ return [
+ {
+ "role": "user",
+ "content": (
+ "Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label.\n\n"
+ + "tweet: "
+ + input_sample
+ + "\nlabel: "
+ ),
+ }
+ ]
+
+
+def post_process(response):
+ # Extract the label from the response; a missing "output" key means no prediction
+ if "output" not in response:
+ return None
+ label = response["output"].strip().lower()
+
+ print("label: ", label)
+
+ if "not_harmful" in label:
+ return "0"
+ elif label == "harmful":
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..130505ee
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Mixed.py
@@ -0,0 +1,55 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+ return {
+ "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+ "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+ "model": "Llama-3.1-8B-Instruct",
+ "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+ }
+
+
+def config():
+ return {
+ "dataset": CT22HarmfulDataset,
+ "task": HarmfulDetectionTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ return [
+ {
+ "role": "user",
+ "content": (
+ "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+ + "تغريدة: "
+ + input_sample
+ + "التصنيف: "
+ ),
+ }
+ ]
+
+
+def post_process(response):
+ # Extract the label from the response; a missing "output" key means no prediction
+ if "output" not in response:
+ return None
+ label = response["output"].strip().lower()
+
+ print("label: ", label)
+ # "I cannot provide this type of answer." -- treat refusals as no prediction
+ if "لا أستطيع تقديم هذا النوع من الإجابات." in label:
+ return None
+ elif "غير" in label or "ليس" in label or "not" in label:
+ return "0"
+ else:
+ return "1"