Adding wise24 subjectivity assets (#371)

* Adding wise24 subjectivity assets * Fix pr 372 -- packages versions update (#373) * Adding wise-spam assets * updated package version * updated sentence-transformer version * updated sentencetransformer version --------- Co-authored-by: MohamedBayan <[email protected]> --------- Co-authored-by: Firoj Alam, Scientist, QCRI <[email protected]>
qcri · Jan 20, 2025 · 800926c · 800926c
1 parent 5eafe6c
commit 800926c
Show file tree

Hide file tree

Showing 18 changed files with 1,130 additions and 0 deletions.
diff --git a/...ality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot_Arabic.py b/...ality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot_Arabic.py
@@ -0,0 +1,64 @@
+from llmebench.datasets import CT23SubjectivityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import SubjectivityTask
+
+
+def metadata():
+    """Return the author, affiliation, model and description of this asset."""
+    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
+    institution = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
+    # Markdown-formatted pointer to the published paper and its preprint.
+    summary = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."
+    return dict(
+        author=authors,
+        affiliation=institution,
+        model="GPT-4o-2024-05-22",
+        description=summary,
+    )
+
+
+def config():
+    """Return the dataset, task, model and argument settings for this asset.
+
+    The model arguments carry the two class labels and a retry limit; the
+    general arguments name the test split and the few-shot training split.
+    """
+    model_args = {"class_labels": ["SUBJ", "OBJ"], "max_tries": 30}
+    general_args = {
+        "test_split": "ar/dev",
+        "fewshot": {"train_split": "ar"},
+    }
+    return {
+        "dataset": CT23SubjectivityDataset,
+        "task": SubjectivityTask,
+        "model": OpenAIModel,
+        "model_args": model_args,
+        "general_args": general_args,
+    }
+
+
+def prompt(input_sample, examples):
+    """Build the chat messages (system + user) for one few-shot request.
+
+    Args:
+        input_sample: Text of the tweet to classify.
+        examples: Few-shot examples, forwarded to ``few_shot_prompt``.
+
+    Returns:
+        A two-element list of ``{"role", "content"}`` dicts.
+    """
+    instruction = 'صنف التغريدة الى "موضوعية" أو "غير موضوعية". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.\n'
+    system_turn = {
+        "role": "system",
+        "content": "أنت خبير في تحليل و تصنيف التغريدات.",
+    }
+    user_turn = {
+        "role": "user",
+        "content": few_shot_prompt(input_sample, instruction, examples),
+    }
+    return [system_turn, user_turn]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    """Assemble the Arabic few-shot user prompt.
+
+    Args:
+        input_sample: Tweet to classify; placed last with an empty label slot.
+        base_prompt: Instruction text placed at the top of the prompt.
+        examples: Iterable of mappings with ``"input"`` and ``"label"`` keys.
+
+    Returns:
+        The full prompt string: instruction, numbered examples, then the query.
+    """
+    sections = [base_prompt + "\n", "هنا بعض الأمثلة:\n\n"]
+    for position, shot in enumerate(examples):
+        # Any gold label other than "OBJ" is rendered as the subjective label.
+        verdict = "موضوعية" if shot["label"] == "OBJ" else "غير موضوعية"
+        sections.append(
+            f"مثال {position}:\nالتغريدة: {shot['input']}\nالتصنيف: {verdict}\n\n"
+        )
+    sections.append(f"التغريدة: {input_sample}\nالتصنيف: \n")
+    return "".join(sections)
+
+
+def post_process(response):
+    """Map the model reply to ``"SUBJ"``, ``"OBJ"`` or ``None``.
+
+    The reply text is read from ``response["choices"][0]["message"]["content"]``,
+    stripped and lower-cased, then matched by substring. Negation/subjective
+    markers are tested before the objective ones.
+    """
+    reply = response["choices"][0]["message"]["content"].strip().lower()
+    subjective_markers = ("غير", "subj", "not", "ليس")
+    objective_markers = ("موضوعية", "obj")
+    if any(marker in reply for marker in subjective_markers):
+        return "SUBJ"
+    if any(marker in reply for marker in objective_markers):
+        return "OBJ"
+    return None
diff --git a/...lity_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot_English.py b/...lity_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot_English.py
@@ -0,0 +1,80 @@
+from llmebench.datasets import CT23SubjectivityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import SubjectivityTask
+
+
+def metadata():
+    """Return the author, affiliation, model and description of this asset."""
+    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
+    institution = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
+    # Markdown-formatted pointer to the published paper and its preprint.
+    summary = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."
+    return dict(
+        author=authors,
+        affiliation=institution,
+        model="GPT-4o-2024-05-22",
+        description=summary,
+    )
+
+
+def config():
+    """Return the dataset, task, model and argument settings for this asset.
+
+    The model arguments carry the two class labels and a retry limit; the
+    general arguments name the test split and the few-shot training split.
+    """
+    model_args = {"class_labels": ["SUBJ", "OBJ"], "max_tries": 30}
+    general_args = {
+        "test_split": "ar/dev",
+        "fewshot": {"train_split": "ar"},
+    }
+    return {
+        "dataset": CT23SubjectivityDataset,
+        "task": SubjectivityTask,
+        "model": OpenAIModel,
+        "model_args": model_args,
+        "general_args": general_args,
+    }
+
+
+def prompt(input_sample, examples):
+    """Build the chat messages (system + user) for one few-shot request.
+
+    Args:
+        input_sample: Text of the tweet to classify.
+        examples: Few-shot examples, forwarded to ``few_shot_prompt``.
+
+    Returns:
+        A two-element list of ``{"role", "content"}`` dicts.
+    """
+    instruction = "Classify the 'tweet' as subjective or objective. Provide only label.\n"
+    system_turn = {
+        "role": "system",
+        "content": "You are an expert in tweet classification and analysis.",
+    }
+    user_turn = {
+        "role": "user",
+        "content": few_shot_prompt(input_sample, instruction, examples),
+    }
+    return [system_turn, user_turn]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    """Assemble the English few-shot user prompt.
+
+    Args:
+        input_sample: Tweet to classify; placed last with an empty label slot.
+        base_prompt: Instruction text placed at the top of the prompt.
+        examples: Iterable of mappings with ``"input"`` and ``"label"`` keys.
+
+    Returns:
+        The full prompt string: instruction, numbered examples, then the query.
+    """
+    pieces = [base_prompt, "\n", "Here are some examples:\n\n"]
+    for position, shot in enumerate(examples):
+        # Any gold label other than "OBJ" is rendered as "subjective".
+        verdict = "objective" if shot["label"] == "OBJ" else "subjective"
+        pieces.extend(
+            [
+                "Example ",
+                str(position),
+                ":",
+                "\n",
+                "tweet: ",
+                shot["input"],
+                "\nlabel: ",
+                verdict,
+                "\n\n",
+            ]
+        )
+    pieces.extend(["tweet: ", input_sample, "\nlabel: \n"])
+    # str.join rejects non-string items, just as "+" concatenation would.
+    return "".join(pieces)
+
+
+def post_process(response):
+    """Map the model reply to ``"SUBJ"``, ``"OBJ"`` or ``None``.
+
+    The reply text is read from ``response["choices"][0]["message"]["content"]``,
+    stripped and lower-cased, then matched by substring against English and
+    Arabic label fragments.
+
+    Returns:
+        ``"SUBJ"`` or ``"OBJ"`` when a known marker is found, otherwise ``None``.
+    """
+    reply = response["choices"][0]["message"]["content"].strip().lower()
+
+    # Arabic negation must be tested first: "غير موضوعية" (subjective) contains
+    # "موضوعي", so testing the objective marker first would mislabel it as OBJ.
+    if "غير" in reply or "ليس" in reply:
+        return "SUBJ"
+    if "obj" in reply or "موضوعي" in reply:
+        return "OBJ"
+    if "subj" in reply or "لا" in reply or "ذاتي" in reply:
+        return "SUBJ"
+    return None
diff --git a/...uality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot_Mixed.py b/...uality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot_Mixed.py
@@ -0,0 +1,69 @@
+from llmebench.datasets import CT23SubjectivityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import SubjectivityTask
+
+
+def metadata():
+    """Return the author, affiliation, model and description of this asset."""
+    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
+    institution = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
+    # Markdown-formatted pointer to the published paper and its preprint.
+    summary = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."
+    return dict(
+        author=authors,
+        affiliation=institution,
+        model="GPT-4o-2024-05-22",
+        description=summary,
+    )
+
+
+def config():
+    """Return the dataset, task, model and argument settings for this asset.
+
+    The model arguments carry the two class labels and a retry limit; the
+    general arguments name the test split and the few-shot training split.
+    """
+    model_args = {"class_labels": ["SUBJ", "OBJ"], "max_tries": 30}
+    general_args = {
+        "test_split": "ar/dev",
+        "fewshot": {"train_split": "ar"},
+    }
+    return {
+        "dataset": CT23SubjectivityDataset,
+        "task": SubjectivityTask,
+        "model": OpenAIModel,
+        "model_args": model_args,
+        "general_args": general_args,
+    }
+
+
+def prompt(input_sample, examples):
+    """Build the chat messages (system + user) for one few-shot request.
+
+    The instruction is written in Arabic but asks for the English labels
+    "subjective" / "objective".
+
+    Args:
+        input_sample: Text of the tweet to classify.
+        examples: Few-shot examples, forwarded to ``few_shot_prompt``.
+
+    Returns:
+        A two-element list of ``{"role", "content"}`` dicts.
+    """
+    instruction = 'صنف التغريدة الى "subjective" أو "objective". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.\n'
+    system_turn = {
+        "role": "system",
+        "content": "أنت خبير في تحليل و تصنيف التغريدات.",
+    }
+    user_turn = {
+        "role": "user",
+        "content": few_shot_prompt(input_sample, instruction, examples),
+    }
+    return [system_turn, user_turn]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    """Assemble the mixed-language few-shot user prompt.
+
+    The scaffolding text is Arabic while example labels are the English words
+    "objective" / "subjective".
+
+    Args:
+        input_sample: Tweet to classify; placed last with an empty label slot.
+        base_prompt: Instruction text placed at the top of the prompt.
+        examples: Iterable of mappings with ``"input"`` and ``"label"`` keys.
+
+    Returns:
+        The full prompt string: instruction, numbered examples, then the query.
+    """
+    sections = [base_prompt + "\n", "هنا بعض الأمثلة:\n\n"]
+    for position, shot in enumerate(examples):
+        # Any gold label other than "OBJ" is rendered as "subjective".
+        verdict = "objective" if shot["label"] == "OBJ" else "subjective"
+        sections.append(
+            f"مثال {position}:\nالتغريدة: {shot['input']}\nالتصنيف: {verdict}\n\n"
+        )
+    sections.append(f"التغريدة: {input_sample}\nالتصنيف: \n")
+    return "".join(sections)
+
+
+def post_process(response):
+    """Map the model reply to ``"SUBJ"``, ``"OBJ"`` or ``None``.
+
+    The reply text is read from ``response["choices"][0]["message"]["content"]``,
+    stripped and lower-cased, then matched by substring against English and
+    Arabic label fragments.
+
+    Returns:
+        ``"SUBJ"`` or ``"OBJ"`` when a known marker is found, otherwise ``None``.
+    """
+    reply = response["choices"][0]["message"]["content"].strip().lower()
+
+    # Arabic negation must be tested first: "غير موضوعية" (subjective) contains
+    # "موضوعي", so testing the objective marker first would mislabel it as OBJ.
+    if "غير" in reply or "ليس" in reply:
+        return "SUBJ"
+    if "obj" in reply or "موضوعي" in reply:
+        return "OBJ"
+    if "subj" in reply or "لا" in reply or "ذاتي" in reply:
+        return "SUBJ"
+    return None
diff --git a/...lity_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot_Arabic.py b/...lity_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot_Arabic.py
@@ -0,0 +1,49 @@
+from llmebench.datasets import CT23SubjectivityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import SubjectivityTask
+
+
+def metadata():
+    """Return the author, affiliation, model and description of this asset."""
+    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
+    institution = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
+    # Markdown-formatted pointer to the published paper and its preprint.
+    summary = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."
+    return dict(
+        author=authors,
+        affiliation=institution,
+        model="GPT-4o-2024-05-22",
+        description=summary,
+    )
+
+
+def config():
+    """Return the dataset, task, model and test-split settings for this asset."""
+    settings = {}
+    settings["dataset"] = CT23SubjectivityDataset
+    settings["task"] = SubjectivityTask
+    settings["model"] = OpenAIModel
+    settings["general_args"] = {"test_split": "ar/dev"}
+    return settings
+
+
+def prompt(input_sample):
+    """Build the chat messages (system + user) for one zero-shot request.
+
+    Args:
+        input_sample: Text of the tweet to classify.
+
+    Returns:
+        A two-element list of ``{"role", "content"}`` dicts; the user message
+        holds the Arabic instruction, the tweet, and an empty label slot.
+    """
+    user_text = "".join(
+        [
+            'صنف التغريدة الى "موضوعية" أو "غير موضوعية". قم بتقديم التصنيف فقط دون أي تبرير إضافي.\n',
+            f"التغريدة: {input_sample}\n",
+            "التصنيف: \n",
+        ]
+    )
+    system_turn = {
+        "role": "system",
+        "content": "أنت خبير في تحليل و تصنيف التغريدات.",
+    }
+    user_turn = {"role": "user", "content": user_text}
+    return [system_turn, user_turn]
+
+
+def post_process(response):
+    """Map the model reply to ``"SUBJ"``, ``"OBJ"`` or ``None``.
+
+    The reply text is read from ``response["choices"][0]["message"]["content"]``,
+    stripped and lower-cased, then matched by substring. Negation/subjective
+    markers are tested before the objective ones.
+    """
+    reply = response["choices"][0]["message"]["content"].strip().lower()
+    subjective_markers = ("غير", "subj", "not", "ليس")
+    objective_markers = ("موضوعية", "obj")
+    if any(marker in reply for marker in subjective_markers):
+        return "SUBJ"
+    if any(marker in reply for marker in objective_markers):
+        return "OBJ"
+    return None
diff --git a/...ity_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot_English.py b/...ity_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot_English.py
@@ -0,0 +1,58 @@
+from llmebench.datasets import CT23SubjectivityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import SubjectivityTask
+
+
+def metadata():
+    """Return the author, affiliation, model and description of this asset."""
+    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
+    institution = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
+    # Markdown-formatted pointer to the published paper and its preprint.
+    summary = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."
+    return dict(
+        author=authors,
+        affiliation=institution,
+        model="GPT-4o-2024-05-22",
+        description=summary,
+    )
+
+
+def config():
+    """Return the dataset, task, model and argument settings for this asset.
+
+    The model arguments carry the two class labels and a retry limit; the
+    general arguments name the test split.
+    """
+    model_args = {"class_labels": ["SUBJ", "OBJ"], "max_tries": 30}
+    general_args = {"test_split": "ar/dev"}
+    return {
+        "dataset": CT23SubjectivityDataset,
+        "task": SubjectivityTask,
+        "model": OpenAIModel,
+        "model_args": model_args,
+        "general_args": general_args,
+    }
+
+
+def prompt(input_sample):
+    """Build the chat messages (system + user) for one zero-shot request.
+
+    Args:
+        input_sample: Text of the tweet to classify.
+
+    Returns:
+        A two-element list of ``{"role", "content"}`` dicts; the user message
+        holds the English instruction, the tweet, and an empty label slot.
+    """
+    user_text = "".join(
+        [
+            'Classify the tweet as "objective" or "subjective". Provide only the label.\n\n',
+            f"tweet: {input_sample}\n",
+            "label: \n",
+        ]
+    )
+    system_turn = {
+        "role": "system",
+        "content": "You are an expert in tweet classification and analysis.",
+    }
+    user_turn = {"role": "user", "content": user_text}
+    return [system_turn, user_turn]
+
+
+def post_process(response):
+    """Map the model reply to ``"SUBJ"``, ``"OBJ"`` or ``None``.
+
+    The reply text is read from ``response["choices"][0]["message"]["content"]``,
+    stripped and lower-cased, then matched by substring against English and
+    Arabic label fragments.
+
+    Returns:
+        ``"SUBJ"`` or ``"OBJ"`` when a known marker is found, otherwise ``None``.
+    """
+    reply = response["choices"][0]["message"]["content"].strip().lower()
+
+    # Arabic negation must be tested first: "غير موضوعية" (subjective) contains
+    # "موضوعي", so testing the objective marker first would mislabel it as OBJ.
+    if "غير" in reply or "ليس" in reply:
+        return "SUBJ"
+    if "obj" in reply or "موضوعي" in reply:
+        return "OBJ"
+    if "subj" in reply or "لا" in reply or "ذاتي" in reply:
+        return "SUBJ"
+    return None
diff --git a/...ality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot_Mixed.py b/...ality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot_Mixed.py
@@ -0,0 +1,54 @@
+from llmebench.datasets import CT23SubjectivityDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import SubjectivityTask
+
+
+def metadata():
+    """Return the author, affiliation, model and description of this asset."""
+    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
+    institution = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
+    # Markdown-formatted pointer to the published paper and its preprint.
+    summary = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."
+    return dict(
+        author=authors,
+        affiliation=institution,
+        model="GPT-4o-2024-05-22",
+        description=summary,
+    )
+
+
+def config():
+    """Return the dataset, task, model and test-split settings for this asset."""
+    settings = {}
+    settings["dataset"] = CT23SubjectivityDataset
+    settings["task"] = SubjectivityTask
+    settings["model"] = OpenAIModel
+    settings["general_args"] = {"test_split": "ar/dev"}
+    return settings
+
+
+def prompt(input_sample):
+    """Build the chat messages (system + user) for one zero-shot request.
+
+    The instruction is written in Arabic but asks for the English labels
+    "objective" / "subjective".
+
+    Args:
+        input_sample: Text of the tweet to classify.
+
+    Returns:
+        A two-element list of ``{"role", "content"}`` dicts; the user message
+        holds the instruction, the tweet, and an empty label slot.
+    """
+    user_text = "".join(
+        [
+            'صنف التغريدة الى "objective" أو "subjective". قم بتقديم التصنيف دون أي تبرير إضافي.\n',
+            f"التغريدة: {input_sample}\n",
+            "التصنيف: \n",
+        ]
+    )
+    system_turn = {
+        "role": "system",
+        "content": "أنت خبير في تحليل و تصنيف التغريدات.",
+    }
+    user_turn = {"role": "user", "content": user_text}
+    return [system_turn, user_turn]
+
+
+def post_process(response):
+    """Map the model reply to ``"SUBJ"``, ``"OBJ"`` or ``None``.
+
+    The reply text is read from ``response["choices"][0]["message"]["content"]``,
+    stripped and lower-cased, then matched by substring against English and
+    Arabic label fragments.
+
+    Returns:
+        ``"SUBJ"`` or ``"OBJ"`` when a known marker is found, otherwise ``None``.
+    """
+    reply = response["choices"][0]["message"]["content"].strip().lower()
+
+    # Arabic negation must be tested first: "غير موضوعية" (subjective) contains
+    # "موضوعي", so testing the objective marker first would mislabel it as OBJ.
+    if "غير" in reply or "ليس" in reply:
+        return "SUBJ"
+    if "obj" in reply or "موضوعي" in reply:
+        return "OBJ"
+    if "subj" in reply or "لا" in reply or "ذاتي" in reply:
+        return "SUBJ"
+    return None