diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Arabic.py
new file mode 100755
index 00000000..a115fa49
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Arabic.py
@@ -0,0 +1,87 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "GPT-4o-2024-05-22",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": ANSFactualityDataset,
+        "task": FactualityTask,
+        "model": OpenAIModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample, examples):
+    prompt_text = "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. أجب فقط بـ 'صحيحة' أو 'خاطئة'.\n\n"
+
+    fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+    return [
+        {
+            "role": "system",
+            "content": "أنت خبير في التحقق من الحقائق.",
+        },
+        {
+            "role": "user",
+            "content": fs_prompt,
+        },
+    ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt
+    for example in examples:
+        sent = example["input"]
+        label = "صحيحة" if example["label"] == "true" else "خاطئة"
+
+        out_prompt = (
+            out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+        )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+    # print("=========== FS Prompt =============\n")
+    # print(out_prompt)
+
+    return out_prompt
+
+
+def post_process(response):
+    input_label = response["choices"][0]["message"]["content"]
+    input_label = input_label.replace(".", "").strip().lower()
+
+    if (
+        "true" in input_label
+        or "yes" in input_label
+        or "right" in input_label
+        or "صحيح" in input_label
+        or "صحيحة" in input_label
+        or "نعم" in input_label
+    ):
+        return "true"
+
+    elif (
+        "false" in input_label
+        or "wrong" in input_label
+        or "خطأ" in input_label
+        or "لا" in input_label
+        or "not" in input_label
+        or "خاطئة" in input_label
+        or "خاطئ" in input_label
+    ):
+        return "false"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_English.py
new file mode 100755
index 00000000..2253ecfb
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_English.py
@@ -0,0 +1,87 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "GPT-4o-2024-05-22",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": ANSFactualityDataset,
+        "task": FactualityTask,
+        "model": OpenAIModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample, examples):
+    prompt_text = "Detect whether the information in the sentence is factually true or false. Answer only by true or false.\n\n"
+
+    fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+    return [
+        {
+            "role": "system",
+            "content": "You are an expert fact checker.",
+        },
+        {
+            "role": "user",
+            "content": fs_prompt,
+        },
+    ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt
+    for example in examples:
+        sent = example["input"]
+        label = example["label"]
+
+        out_prompt = (
+            out_prompt + "sentence: " + sent + "\n" + "label: " + label + "\n\n"
+        )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "sentence: " + input_sample + "\nlabel: \n"
+
+    # print("=========== FS Prompt =============\n")
+    # print(out_prompt)
+
+    return out_prompt
+
+
+def post_process(response):
+    input_label = response["choices"][0]["message"]["content"]
+    input_label = input_label.replace(".", "").strip().lower()
+
+    if (
+        "true" in input_label
+        or "yes" in input_label
+        or "right" in input_label
+        or "صحيح" in input_label
+        or "صحيحة" in input_label
+        or "نعم" in input_label
+    ):
+        return "true"
+
+    elif (
+        "false" in input_label
+        or "wrong" in input_label
+        or "خطأ" in input_label
+        or "لا" in input_label
+        or "not" in input_label
+        or "خاطئة" in input_label
+        or "خاطئ" in input_label
+    ):
+        return "false"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Mixed.py
new file mode 100755
index 00000000..1655c2a1
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot_Mixed.py
@@ -0,0 +1,87 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "GPT-4o-2024-05-22",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": ANSFactualityDataset,
+        "task": FactualityTask,
+        "model": OpenAIModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample, examples):
+    prompt_text = "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. أجب فقط بـ 'true' أو 'false'.\n\n"
+
+    fs_prompt = few_shot_prompt(input_sample, prompt_text, examples)
+    return [
+        {
+            "role": "system",
+            "content": "أنت خبير في التحقق من الحقائق.",
+        },
+        {
+            "role": "user",
+            "content": fs_prompt,
+        },
+    ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt
+    for example in examples:
+        sent = example["input"]
+        label = example["label"]
+
+        out_prompt = (
+            out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n"
+        )
+
+    # Append the sentence we want the model to predict for but leave the label blank
+    out_prompt = out_prompt + "الجملة: " + input_sample + "\nالتصنيف: \n"
+
+    # print("=========== FS Prompt =============\n")
+    # print(out_prompt)
+
+    return out_prompt
+
+
+def post_process(response):
+    input_label = response["choices"][0]["message"]["content"]
+    input_label = input_label.replace(".", "").strip().lower()
+
+    if (
+        "true" in input_label
+        or "yes" in input_label
+        or "right" in input_label
+        or "صحيح" in input_label
+        or "صحيحة" in input_label
+        or "نعم" in input_label
+    ):
+        return "true"
+
+    elif (
+        "false" in input_label
+        or "wrong" in input_label
+        or "خطأ" in input_label
+        or "لا" in input_label
+        or "not" in input_label
+        or "خاطئة" in input_label
+        or "خاطئ" in input_label
+    ):
+        return "false"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Arabic.py
new file mode 100755
index 00000000..ff1fe4ec
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Arabic.py
@@ -0,0 +1,73 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "GPT-4o-2024-05-22",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": ANSFactualityDataset,
+        "task": FactualityTask,
+        "model": OpenAIModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample):
+    prompt_text = (
+        "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. 
" + "أجب فقط بـ 'صحيحة' أو 'خاطئة'.\n\n" + + "الجملة: " + + input_sample + + "\nالتصنيف: \n" + ) + + return [ + { + "role": "system", + "content": "أنت خبير في التحقق من الحقائق.", + }, + { + "role": "user", + "content": prompt_text, + }, + ] + + +def post_process(response): + input_label = response["choices"][0]["message"]["content"] + input_label = input_label.replace(".", "").strip().lower() + + if ( + "true" in input_label + or "yes" in input_label + or "right" in input_label + or "صحيح" in input_label + or "صحيحة" in input_label + or "نعم" in input_label + ): + return "true" + + elif ( + "false" in input_label + or "wrong" in input_label + or "خطأ" in input_label + or "لا" in input_label + or "not" in input_label + or "خاطئة" in input_label + or "خاطئ" in input_label + ): + return "false" + + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_English.py new file mode 100755 index 00000000..fc513431 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_English.py @@ -0,0 +1,72 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "Detect whether the information in the sentence is factually true or false. 
Answer only by true or false.\n\n" + + "sentence: " + + input_sample + + "\nlabel: \n" + ) + + return [ + { + "role": "system", + "content": "You are an expert fact checker.", + }, + { + "role": "user", + "content": prompt_text, + }, + ] + + +def post_process(response): + input_label = response["choices"][0]["message"]["content"] + input_label = input_label.replace(".", "").strip().lower() + + if ( + "true" in input_label + or "yes" in input_label + or "right" in input_label + or "صحيح" in input_label + or "صحيحة" in input_label + or "نعم" in input_label + ): + return "true" + + elif ( + "false" in input_label + or "wrong" in input_label + or "خطأ" in input_label + or "لا" in input_label + or "not" in input_label + or "خاطئة" in input_label + or "خاطئ" in input_label + ): + return "false" + + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Mixed.py new file mode 100755 index 00000000..6140335a --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot_Mixed.py @@ -0,0 +1,73 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "حدد ما إذا كانت المعلومات في الجملة صحيحة أم خاطئة. 
" + "أجب فقط بـ 'true' أو 'false'.\n\n" + + "الجملة: " + + input_sample + + "\nالتصنيف: \n" + ) + + return [ + { + "role": "system", + "content": "أنت خبير في التحقق من الحقائق.", + }, + { + "role": "user", + "content": prompt_text, + }, + ] + + +def post_process(response): + input_label = response["choices"][0]["message"]["content"] + input_label = input_label.replace(".", "").strip().lower() + + if ( + "true" in input_label + or "yes" in input_label + or "right" in input_label + or "صحيح" in input_label + or "صحيحة" in input_label + or "نعم" in input_label + ): + return "true" + + elif ( + "false" in input_label + or "wrong" in input_label + or "خطأ" in input_label + or "لا" in input_label + or "not" in input_label + or "خاطئة" in input_label + or "خاطئ" in input_label + ): + return "false" + + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Arabic.py new file mode 100755 index 00000000..4d3b37ee --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Arabic.py @@ -0,0 +1,89 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample, examples): + prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة. 
" + + fs_prompt = few_shot_prompt(input_sample, prompt_text, examples) + return [ + { + "role": "user", + "content": fs_prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + sent = example["input"] + if example["label"] == "true": + label = "نعم" + elif example["label"] == "false": + label = "لا" + out_prompt = ( + out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "الجملة: " + input_sample + "\التصنيف: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace(".", "").strip().lower() + if "آسف" in label or "لا أستطيع" in label: + return None + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + or "غير صحيح" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + or "صحيح" in label + or "صح" in label + ): + pred_label = "true" + else: + print("label problem!! " + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_English.py new file mode 100755 index 00000000..c418520c --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_English.py @@ -0,0 +1,83 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample, examples): + prompt_text = "Detect whether the information in the sentence is factually true or false. 
Answer only by true or false.\n\n" + + fs_prompt = few_shot_prompt(input_sample, prompt_text, examples) + return [ + { + "role": "user", + "content": fs_prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + sent = example["input"] + label = example["label"] + + out_prompt = ( + out_prompt + "Sentence: " + sent + "\n" + "label: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "Sentence: " + input_sample + "\nlabel: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "yes" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! " + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Mixed.py new file mode 100755 index 00000000..23cdda1e --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_FewShot_Mixed.py @@ -0,0 +1,84 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample, examples): + prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. 
" + + fs_prompt = few_shot_prompt(input_sample, prompt_text, examples) + return [ + { + "role": "user", + "content": fs_prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + sent = example["input"] + if example["label"] == "true": + label = "true" + elif example["label"] == "false": + label = "false" + out_prompt = ( + out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "الجملة: " + input_sample + "\التصنيف: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! " + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Arabic.py new file mode 100755 index 00000000..b002cbee --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Arabic.py @@ -0,0 +1,67 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة. " + + "الجملة: " + + input_sample + + "\التصنيف: \n" + ) + + return [ + { + "role": "user", + "content": prompt_text, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_English.py new file mode 100755 index 00000000..986f1274 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_English.py @@ -0,0 +1,64 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = ( + "Detect whether the information in the sentence is factually true or false. " + "Answer only by true or false.\n\n" + + "Sentence: " + + input_sample + + "\nlabel: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + input_label = response["choices"][0]["message"]["content"] + input_label = input_label.replace(".", "").strip().lower() + + if ( + "true" in input_label + or "yes" in input_label + or "label: 1" in input_label + or "label: yes" in input_label + ): + pred_label = "true" + elif ( + "false" in input_label + or "label: 0" in input_label + or "label: no" in input_label + ): + pred_label = "false" + else: + print("label problem!! 
" + input_label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Mixed.py new file mode 100755 index 00000000..469ee3eb --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot_Mixed.py @@ -0,0 +1,67 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. " + + "الجملة: " + + input_sample + + "\nالتصنيف: \n" + ) + + return [ + { + "role": "user", + "content": prompt_text, + } + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Arabic.py new file mode 100755 index 00000000..e72f2575 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Arabic.py @@ -0,0 +1,92 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import AzureModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "Llama-3.1-8B-Instruct", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": AzureModel, + "model_args": { + "max_tries": 100, + }, + } + + +def prompt(input_sample, examples): + prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة." + + fs_prompt = few_shot_prompt(input_sample, prompt_text, examples) + return [ + { + "role": "user", + "content": fs_prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + sent = example["input"] + if example["label"] == "true": + label = "نعم" + elif example["label"] == "false": + label = "لا" + out_prompt = ( + out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "الجملة: " + input_sample + "\التصنيف: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_English.py new file mode 100755 index 00000000..30bced14 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_English.py @@ -0,0 +1,77 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import AzureModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "Llama-3.1-8B-Instruct", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": AzureModel, + "model_args": { + "max_tries": 100, + }, + } + + +def prompt(input_sample, examples): + prompt_text = "Detect whether the information in the sentence is factually true or false. Answer only by true or false.\n\n" + + fs_prompt = few_shot_prompt(input_sample, prompt_text, examples) + return [ + { + "role": "user", + "content": fs_prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + sent = example["input"] + label = example["label"] + + out_prompt = ( + out_prompt + "Sentence: " + sent + "\n" + "label: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "Sentence: " + input_sample + "\nlabel: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + + label = label.replace(".", "").strip().lower() + + if "true" in label or "label: 1" in label or "label: yes" in label: + pred_label = "true" + elif "false" in label or "label: 0" in label or "label: no" in label: + pred_label = "false" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Mixed.py new file mode 100755 index 00000000..18a92734 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_FewShot_Mixed.py @@ -0,0 +1,92 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import AzureModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "Llama-3.1-8B-Instruct", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": AzureModel, + "model_args": { + "max_tries": 100, + }, + } + + +def prompt(input_sample, examples): + prompt_text = "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. " + + fs_prompt = few_shot_prompt(input_sample, prompt_text, examples) + return [ + { + "role": "user", + "content": fs_prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + sent = example["input"] + if example["label"] == "true": + label = "true" + elif example["label"] == "false": + label = "false" + out_prompt = ( + out_prompt + "الجملة: " + sent + "\n" + "التصنيف: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "الجملة: " + input_sample + "\التصنيف: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Arabic.py new file mode 100755 index 00000000..2cd40497 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Arabic.py @@ -0,0 +1,75 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import AzureModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "Llama-3.1-8B-Instruct", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": AzureModel, + "model_args": { + "max_tries": 100, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'نعم' إذا كانت صحيحة و'لا' إذا لم تكن صحيحة. " + + "الجملة: " + + input_sample + + "\التصنيف: \n" + ) + + return [ + { + "role": "user", + "content": prompt_text, + }, + ] + + +def post_process(response): + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "خطا" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "صح" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_English.py new file mode 100755 index 00000000..574b5ce9 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_English.py @@ -0,0 +1,62 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import AzureModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "Llama-3.1-8B-Instruct", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": AzureModel, + "model_args": { + "max_tries": 100, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "Detect whether the information in the sentence is factually true or false. " + "Answer only by 'true' or 'false'.\n\n" + + "Sentence: " + + input_sample + + "\nlabel: \n" + ) + + return [ + { + "role": "user", + "content": prompt_text, + }, + ] + + +def post_process(response): + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + + label = label.replace(".", "").strip().lower() + if "true" in label or "label: 1" in label or "label: yes" in label: + pred_label = "true" + elif "false" in label or "label: 0" in label or "label: no" in label: + pred_label = "false" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Mixed.py new file mode 100755 index 00000000..3cd6c987 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Llama3-8b_ZeroShot_Mixed.py @@ -0,0 +1,75 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import AzureModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "Llama-3.1-8B-Instruct", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": ANSFactualityDataset, + "task": FactualityTask, + "model": AzureModel, + "model_args": { + "max_tries": 100, + }, + } + + +def prompt(input_sample): + prompt_text = ( + "هل المعلومات في الجملة التالية صحيحة أم لا؟ أجب فقط بـ 'true' إذا كانت صحيحة و'false' إذا لم تكن صحيحة. " + + "الجملة: " + + input_sample + + "\nالتصنيف: \n" + ) + + return [ + { + "role": "user", + "content": prompt_text, + } + ] + + +def post_process(response): + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + + label = label.replace(".", "").strip().lower() + + if ( + "لا" in label + or "خطأ" in label + or "ليست" in label + or "false" in label + or "label: 0" in label + or "label: no" in label + ): + pred_label = "false" + elif ( + "نعم" in label + or "صحيحة" in label + or "true" in label + or "label: 1" in label + or "label: yes" in label + ): + pred_label = "true" + else: + print("label problem!! 
" + label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Arabic.py new file mode 100755 index 00000000..187661c9 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Arabic.py @@ -0,0 +1,85 @@ +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def prompt(input_sample, examples): + base_prompt = "هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'ضارة'. أجب بـ 'غير ضارة' إن لم تكن كذلك. قدم فقط التصنيف.\n" + prompt = few_shot_prompt(input_sample, base_prompt, examples) + + return [ + { + "role": "system", + "content": "أنت خبير في تحليل وتصنيف التغريدات.", + }, + { + "role": "user", + "content": prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + for example in examples: + # Found chatgpt confused when using 0 and 1 in the prompt + label = "غير ضارة" if example["label"] == "0" else "ضارة" + out_prompt = ( + out_prompt + "التغريدة: " + example["input"] + "التصنيف: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "التغريدة: " + input_sample + "التصنيف: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label_fixed = None + label = label.replace("label:", "") + label = label.replace("التصنيف:", "") + label = label.strip().lower() + + if ( + "غير ضارة" in label + or "ليست ضارة" in label + or "غير ضاره" in label + or "غير" in label + or "not" in label + or "ليست" in label + or "لا" in label + or "not harmful" in label + or label.startswith("no") + or "safe" in label + or "not_harmful" in label + ): + label_fixed = "0" + elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label: + label_fixed = "1" + + return label_fixed diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_English.py new file mode 100755 index 00000000..4915b237 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_English.py @@ -0,0 +1,84 @@ +from llmebench.datasets import CT22HarmfulDataset +from 
llmebench.models import OpenAIModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def prompt(input_sample, examples): + base_prompt = "Can this tweet cause harm to any person, organization or entity? Answer only by 'harmful' or 'not harmful'. Provide only label.\n" + prompt = few_shot_prompt(input_sample, base_prompt, examples) + + return [ + { + "role": "system", + "content": "You are an expert in tweet analysis and classification.", + }, + { + "role": "user", + "content": prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + for example in examples: + # Found chatgpt confused when using 0 and 1 in the prompt + label = "harmful" if example["label"] == "1" else "not harmful" + out_prompt = ( + out_prompt + "tweet: " + example["input"] + "\nlabel: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the label blank + out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace("label:", "") + label = label.replace("التصنيف:", "") + label = label.strip().lower() + + if ( + "غير ضارة" in label + or "ليست ضارة" in label + or "غير ضاره" in label + or "غير" in label + or "not" in label + or "ليست" in label + or "لا" in label + or "not harmful" in label + or label.startswith("no") + or "safe" in label + or "not_harmful" in label + ): + return "0" + elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label: + return "1" + + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Mixed.py new file mode 100755 index 00000000..ffe298b4 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot_Mixed.py @@ -0,0 +1,85 @@ +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore 
the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def prompt(input_sample, examples): + base_prompt = "هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'harmful'. أجب بـ 'not harmful' إن لم تكن كذلك. قدم فقط التصنيف.\n" + prompt = few_shot_prompt(input_sample, base_prompt, examples) + + return [ + { + "role": "system", + "content": "أنت خبير في تحليل وتصنيف التغريدات.", + }, + { + "role": "user", + "content": prompt, + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + for example in examples: + # Found chatgpt confused when using 0 and 1 in the prompt + label = "not harmful" if example["label"] == "0" else "harmful" + out_prompt = ( + out_prompt + "التغريدة: " + example["input"] + "التصنيف: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "التغريدة: " + input_sample + "التصنيف: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label_fixed = None + label = label.replace("label:", "") + label = label.replace("التصنيف:", "") + label = label.strip().lower() + + if ( + "غير ضارة" in label + or "ليست ضارة" in label + or "غير ضاره" in label + or "غير" in label + or "not" in label + or "ليست" in label + or "لا" in label + or "not harmful" in label + or label.startswith("no") + or "safe" in label + or "not_harmful" in label + ): + return "0" + elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label: + return "1" + + return label_fixed diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Arabic.py new file mode 100755 index 00000000..0f5e05b0 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Arabic.py @@ -0,0 +1,71 @@ +import re + +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + prompt_string = ( + f"هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'ضارة'. أجب بـ 'غير ضارة' إن لم تكن كذلك. 
قدم فقط التصنيف.\n\n" + f"التغريدة: {input_sample}\n" + f"التصنيف: \n" + ) + return [ + { + "role": "system", + "content": "أنت خبير في تحليل وتصنيف التغريدات.", + }, + { + "role": "user", + "content": prompt_string, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace("label:", "") + label = label.replace("التصنيف:", "") + label = label.strip().lower() + + if ( + "غير ضارة" in label + or "ليست ضارة" in label + or "غير ضاره" in label + or "غير" in label + or "not" in label + or "ليست" in label + or "لا" in label + or "not harmful" in label + or label.startswith("no") + or "safe" in label + or "not_harmful" in label + ): + return "0" + elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label: + return "1" + + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_English.py new file mode 100755 index 00000000..cac11ed7 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_English.py @@ -0,0 +1,71 @@ +import re + +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + prompt_string = ( + f"Can this tweet cause harm to any person, organization or entity? Answer only by 'harmful' or 'not harmful'. 
Provide only label.\n\n" + f"tweet: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "system", + "content": "You are an expert in tweet analysis and classification.", + }, + { + "role": "user", + "content": prompt_string, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace("label:", "") + label = label.replace("التصنيف:", "") + label = label.strip().lower() + + if ( + "غير ضارة" in label + or "ليست ضارة" in label + or "غير ضاره" in label + or "غير" in label + or "not" in label + or "ليست" in label + or "لا" in label + or "not harmful" in label + or label.startswith("no") + or "safe" in label + or "not_harmful" in label + ): + return "0" + elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label: + return "1" + + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Mixed.py new file mode 100755 index 00000000..13196db3 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot_Mixed.py @@ -0,0 +1,72 @@ +import re + +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "GPT-4o-2024-05-22", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + prompt_string = ( + f"هل قد تتسبب هذه التغريدة بأي ضرر أو أذى لشخص أو مؤسسة؟ إن كانت كذلك، أجب بـ 'harmful'. أجب بـ 'not harmful' إن لم تكن كذلك. 
قدم فقط التصنيف.\n\n" + f"التغريدة: {input_sample}\n" + f"التصنيف: \n" + ) + return [ + { + "role": "system", + "content": "أنت خبير في تحليل وتصنيف التغريدات.", + }, + { + "role": "user", + "content": prompt_string, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label_fixed = None + label = label.replace("label:", "") + label = label.replace("التصنيف:", "") + label = label.strip().lower() + + if ( + "غير ضارة" in label + or "ليست ضارة" in label + or "غير ضاره" in label + or "غير" in label + or "not" in label + or "ليست" in label + or "لا" in label + or "not harmful" in label + or label.startswith("no") + or "safe" in label + or "not_harmful" in label + ): + return "0" + elif "ضارة" in label or "harmful" in label or "نعم" in label or "yes" in label: + return "1" + + return label_fixed diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Arabic.py new file mode 100755 index 00000000..9e9be164 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Arabic.py @@ -0,0 +1,67 @@ +import random + +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import FastChatModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 3, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + for example in examples: + # Translate labels to Arabic + label = "غير ضارة" if example["label"] == "0" else "ضارة" + out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n" + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط." 
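+    # Illustrative sketch only (demonstration tweets invented): few_shot_prompt()
+    # assembles a prompt of the form
+    #
+    #   صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط.
+    #   التغريدة: <demonstration tweet>
+    #   التصنيف: ضارة
+    #   ...
+    #   التغريدة: <input tweet>
+    #   التصنيف: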
+ return [ + { + "role": "user", + "content": (few_shot_prompt(input_sample, base_prompt, examples)), + } + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower() + print("label: ", label) + if "لا يمكنني" in label or "ai" in label: + return None + if ( + "غير ضارة" in label + or "لا يحتوي على محتوى ضار" in label + or "غير" in label + or "ليست" in label + ): + return "0" + + else: + return "1" diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_English.py new file mode 100755 index 00000000..c26b458a --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_English.py @@ -0,0 +1,75 @@ +import random + +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import FastChatModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 3, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + for example in examples: + # Found chatgpt confused when using 0 and 1 in the prompt + label = "not_harmful" if example["label"] == "0" else "harmful" + out_prompt = ( + out_prompt + "tweet: " + example["input"] + "\nlabel: " + label + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n" + + # print("=========== FS Prompt =============\n") + # print(out_prompt) + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = "Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label." 
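+    # Illustrative sketch only (demonstration tweets invented): the assembled
+    # few-shot prompt has the form
+    #
+    #   Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label.
+    #   tweet: <demonstration tweet>
+    #   label: harmful
+    #   ...
+    #   tweet: <input tweet>
+    #   label: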
+    return [
+        {
+            "role": "user",
+            "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+        }
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"].lower()
+    print("label: ", label)
+    if "label: harmful" in label or "is harmful" in label:
+        return "1"
+    if "لا يمكنني" in label or "ai" in label or "لا يمكن" in label:
+        return None
+    if (
+        "غير ضارة" in label
+        or "لا يحتوي على محتوى ضار" in label
+        or "غير" in label
+        or "ليست" in label
+        or "not" in label
+    ):
+        return "0"
+
+    else:
+        return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Mixed.py
new file mode 100755
index 00000000..9fe4d0e3
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_FewShot_Mixed.py
@@ -0,0 +1,89 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "jais-13b-chat",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 3,
+        },
+        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+    }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    for example in examples:
+        # Keep the label strings in English; the rest of the template is Arabic
+        label = "not_harmful" if example["label"] == "0" else "harmful"
+        out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+    # Append the tweet we want the model to predict for, leaving the label blank
+    out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+    return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = (
+        "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+        }
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"].lower()
+    print("label: ", label)
+    if (
+        "لا يمكنني" in label
+        or "ai" in label
+        or "لا يمكن" in label
+        or "ما هي المعايير" in label
+    ):
+        return None
+    if "التغريدة الأخيرة هي 'harmful'" in label:
+        return "1"
+    if (
+        "التغريدة الثالثة مصنفة على أنها 'غير مؤذية'" in label
+        # "because it does not contain harmful content" signals a not-harmful verdict
+        or "لأنها لا تحتوي على محتوى ضار" in label
+    ):
+        return "0"
+    if "التغريدة الثالثة مصنفة على أنها 'مؤذية'" in label or "غير آمن" in label:
+        return "1"
+
+    if "label: harmful" in label or "is harmful" in label:
+        return "1"
+
+    if (
+        "غير ضارة" in label
+        or "لا يحتوي على محتوى ضار" in label
+        or "غير" in label
+        or "ليس" in label
+        or "no" in label
+    ):
+        return "0"
+
+    else:
+        return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..42592efa
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Arabic.py
@@ -0,0 +1,58 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "jais-13b-chat",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    return [
+        {
+            "role": "user",
+            "content": (
+                "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط.\n\n"
+                + "تغريدة: "
+                + input_sample
+                + "\nالتصنيف: "
+            ),
+        }
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"].lower()
+    print("label: ", label)
+    if "لا يمكنني" in label or "ai" in label:
+        return None
+    if (
+        "غير ضارة" in label
+        or "لا يحتوي على محتوى ضار" in label
+        or "غير" in label
+        or "ليست" in label
+    ):
+        return "0"
+
+    else:
+        return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_English.py
new file mode 100755
index 00000000..273545b3
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_English.py
@@ -0,0 +1,59 @@
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "jais-13b-chat",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f"Classify the following tweet as 'harmful' or 'not_harmful'. 
Provide only label.\n\n" + f"tweet: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower() + print("label: ", label) + if "label: harmful" in label or "is harmful" in label: + return "1" + if "لا يمكنني" in label or "ai" in label or "لا يمكن" in label: + return None + if ( + "غير ضارة" in label + or "لا يحتوي على محتوى ضار" in label + or "غير" in label + or "ليست" in label + or "not" in label + ): + return "0" + + else: + return "1" diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Mixed.py new file mode 100755 index 00000000..8fd926a5 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot_Mixed.py @@ -0,0 +1,59 @@ +import random + +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import FastChatModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam", + "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)", + "model": "jais-13b-chat", + "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + return [ + { + "role": "user", + "content": ( + "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. 
قدم التصنيف فقط.\n\n"
+                + "تغريدة: "
+                + input_sample
+                + "\nالتصنيف: "
+            ),
+        }
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"].lower()
+
+    if "غير" in label or "ليس" in label or "not" in label:
+        return "0"
+    elif (
+        "ضار" in label
+        or "مس" in label
+        or "مؤ" in label
+        or "harm" in label
+        or "مض" in label
+    ):
+        return "1"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Arabic.py
new file mode 100755
index 00000000..26d6281c
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Arabic.py
@@ -0,0 +1,66 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "Llama-3.1-8B-Instruct",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": AzureModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 3,
+        },
+        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+    }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    for example in examples:
+        # Translate labels to Arabic
+        label = "غير ضارة" if example["label"] == "0" else "ضارة"
+        out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+    # Append the tweet we want the model to predict for, leaving the label blank
+    out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+    return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط."
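+    # Demonstrations are drawn from the "ar" train split configured above; the
+    # assembled prompt ends with the target tweet and an empty "التصنيف:" slot.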
+    return [
+        {
+            "role": "user",
+            "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+        }
+    ]
+
+
+def post_process(response):
+    # Extract the label from the response; bail out if the model returned no output
+    if "output" not in response:
+        return None
+    label = response["output"].strip().lower()
+
+    print("label: ", label)
+
+    if "غير ضارة" in label or "لا" in label:
+        return "0"
+    elif "ضارة" in label or "نعم" in label:
+        return "1"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_English.py
new file mode 100755
index 00000000..2495bdab
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_English.py
@@ -0,0 +1,71 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "Llama-3.1-8B-Instruct",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": AzureModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 3,
+        },
+        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+    }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    for example in examples:
+        # ChatGPT tended to get confused when 0 and 1 were used as labels in the prompt
+        label = "not_harmful" if example["label"] == "0" else "harmful"
+        out_prompt = (
+            out_prompt + "tweet: " + example["input"] + "\nlabel: " + label + "\n\n"
+        )
+
+    # Append the tweet we want the model to predict for, leaving the label blank
+    out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+    # print("=========== FS Prompt =============\n")
+    # print(out_prompt)
+
+    return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = "Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label."
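+    # Expected completions are the bare strings "harmful" / "not_harmful";
+    # post_process() below maps them to the class labels "1" / "0".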
+    return [
+        {
+            "role": "user",
+            "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+        }
+    ]
+
+
+def post_process(response):
+    # Extract the label from the response; bail out if the model returned no output
+    if "output" not in response:
+        return None
+    label = response["output"].strip().lower()
+
+    print("label: ", label)
+
+    if "not_harmful" in label:
+        return "0"
+    elif "harmful" in label:
+        return "1"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Mixed.py
new file mode 100755
index 00000000..cb1587b5
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_FewShot_Mixed.py
@@ -0,0 +1,66 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "Llama-3.1-8B-Instruct",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": AzureModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 3,
+        },
+        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+    }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    for example in examples:
+        # Keep the label strings in English; the rest of the template is Arabic
+        label = "not_harmful" if example["label"] == "0" else "harmful"
+        out_prompt += "التغريدة: " + example["input"] + "\nالتصنيف: " + label + "\n\n"
+
+    # Append the tweet we want the model to predict for, leaving the label blank
+    out_prompt += "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+    return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = (
+        "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": (few_shot_prompt(input_sample, base_prompt, examples)),
+        }
+    ]
+
+
+def post_process(response):
+    # Extract the label from the response; bail out if the model returned no output
+    if "output" not in response:
+        return None
+    label = response["output"].strip().lower()
+
+    print("label: ", label)
+    if "لا أستطيع تقديم هذا النوع من الإجابات." in label:
+        return None
+    elif "غير" in label or "ليس" in label or "not" in label:
+        return "0"
+    else:
+        return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..90d4fd20
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Arabic.py
@@ -0,0 +1,57 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "Llama-3.1-8B-Instruct",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": AzureModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    return [
+        {
+            "role": "user",
+            "content": (
+                "صنف التغريدة التالية على أنها 'ضارة' أو 'غير ضارة'. قدم التصنيف فقط.\n\n"
+                + "تغريدة: "
+                + input_sample
+                + "\nالتصنيف: "
+            ),
+        }
+    ]
+
+
+def post_process(response):
+    # Extract the label from the response; bail out if the model returned no output
+    if "output" not in response:
+        return None
+    label = response["output"].strip().lower()
+
+    print("label: ", label)
+
+    if "غير ضارة" in label or "لا" in label:
+        return "0"
+    elif "ضارة" in label or "نعم" in label:
+        return "1"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_English.py
new file mode 100755
index 00000000..e2d8107d
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_English.py
@@ -0,0 +1,57 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "Llama-3.1-8B-Instruct",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": AzureModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    return [
+        {
+            "role": "user",
+            "content": (
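+                # Zero-shot instruction, kept fully in English to match the English label set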
+                "Classify the following tweet as 'harmful' or 'not_harmful'. Provide only label.\n\n"
+                + "tweet: "
+                + input_sample
+                + "\nlabel: "
+            ),
+        }
+    ]
+
+
+def post_process(response):
+    # Extract the label from the response; bail out if the model returned no output
+    if "output" not in response:
+        return None
+    label = response["output"].strip().lower()
+
+    print("label: ", label)
+
+    if "not_harmful" in label:
+        return "0"
+    elif "harmful" in label:
+        return "1"
+
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..130505ee
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Llama3-8b_ZeroShot_Mixed.py
@@ -0,0 +1,55 @@
+import random
+
+from llmebench.datasets import CT22HarmfulDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import HarmfulDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
+        "affiliation": "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
+        "model": "Llama-3.1-8B-Instruct",
+        "description": "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22HarmfulDataset,
+        "task": HarmfulDetectionTask,
+        "model": AzureModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    return [
+        {
+            "role": "user",
+            "content": (
+                "صنف التغريدة التالية على أنها 'harmful' أو 'not_harmful'. قدم التصنيف فقط.\n\n"
+                + "تغريدة: "
+                + input_sample
+                + "\nالتصنيف: "
+            ),
+        }
+    ]
+
+
+def post_process(response):
+    # Extract the label from the response; bail out if the model returned no output
+    if "output" not in response:
+        return None
+    label = response["output"].strip().lower()
+
+    print("label: ", label)
+    if "لا أستطيع تقديم هذا النوع من الإجابات." in label:
+        return None
+    elif "غير" in label or "ليس" in label or "not" in label:
+        return "0"
+    else:
+        return "1"
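+
+
+# Minimal illustrative check of the mapping above (sample responses invented;
+# the {"output": ...} dict mirrors the AzureModel responses handled here):
+#
+#   post_process({"output": "not_harmful"})  ->  "0"
+#   post_process({"output": "ضارة"})          ->  "1"
+#   post_process({})                          ->  None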