Skip to content

Commit

Permalink
Adding wise24 subjectivity assets (#371)
Browse files Browse the repository at this point in the history
* Adding wise24 subjectivity assets

* Fix pr 372 -- packages versions update (#373)

* Adding wise-spam assets

* updated package version

* updated sentence-transformer version

* updated sentencetransformer version

---------

Co-authored-by: MohamedBayan <[email protected]>

---------

Co-authored-by: Firoj Alam, Scientist, QCRI <[email protected]>
  • Loading branch information
MohamedBayan and firojalam authored Jan 20, 2025
1 parent 5eafe6c commit 800926c
Show file tree
Hide file tree
Showing 18 changed files with 1,130 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from llmebench.datasets import CT23SubjectivityDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import SubjectivityTask


def metadata():
    """Return descriptive metadata for this asset.

    The mapping carries the author list, their affiliation, the name of the
    model the asset was written for, and a description pointing to the
    related publication.
    """
    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
    affiliation = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
    model_name = "GPT-4o-2024-05-22"
    description = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."

    info = {}
    info["author"] = authors
    info["affiliation"] = affiliation
    info["model"] = model_name
    info["description"] = description
    return info


def config():
    """Return the run configuration for this few-shot asset.

    The mapping names the dataset, task and model classes, the arguments
    handed to the model, and the general arguments selecting the data
    splits ("ar/dev" for testing, "ar" as the few-shot training split).
    """
    model_arguments = {
        "class_labels": ["SUBJ", "OBJ"],
        # presumably the retry budget for model calls -- confirm against OpenAIModel
        "max_tries": 30,
    }
    general_arguments = {
        "test_split": "ar/dev",
        "fewshot": {"train_split": "ar"},
    }
    return {
        "dataset": CT23SubjectivityDataset,
        "task": SubjectivityTask,
        "model": OpenAIModel,
        "model_args": model_arguments,
        "general_args": general_arguments,
    }


def prompt(input_sample, examples):
    """Build the chat messages (system + user) for one few-shot request.

    Args:
        input_sample: The tweet text to classify.
        examples: Few-shot examples forwarded to ``few_shot_prompt``.

    Returns:
        A two-element list of ``{"role", "content"}`` message dicts.
    """
    # Arabic instruction asking for "objective" / "not objective" with no justification.
    instruction = 'صنف التغريدة الى "موضوعية" أو "غير موضوعية". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.\n'
    system_message = {
        "role": "system",
        "content": "أنت خبير في تحليل و تصنيف التغريدات.",
    }
    user_message = {
        "role": "user",
        "content": few_shot_prompt(input_sample, instruction, examples),
    }
    return [system_message, user_message]


def few_shot_prompt(input_sample, base_prompt, examples):
    """Assemble the user prompt: instruction, labelled examples, then the query.

    Args:
        input_sample: The tweet to be classified; placed last with an empty label.
        base_prompt: Instruction text placed at the top of the prompt.
        examples: Iterable of dicts with "input" and "label" keys. A label of
            "OBJ" is rendered as the Arabic word for "objective"; anything
            else is rendered as "not objective".

    Returns:
        The full prompt as a single string.
    """
    pieces = [base_prompt, "\n", "هنا بعض الأمثلة:\n\n"]

    # Examples are numbered from 0, matching the order they are given in.
    for position, shot in enumerate(examples):
        if shot["label"] == "OBJ":
            shot_label = "موضوعية"
        else:
            shot_label = "غير موضوعية"
        pieces.append(
            f"مثال {position}:\nالتغريدة: {shot['input']}\nالتصنيف: {shot_label}\n\n"
        )

    pieces.append(f"التغريدة: {input_sample}\nالتصنيف: \n")
    return "".join(pieces)


def post_process(response):
    """Map the model's reply text to "SUBJ", "OBJ", or None.

    The reply is read from ``response["choices"][0]["message"]["content"]``,
    stripped and lower-cased. Subjective / negation markers are tested before
    the objective markers; if nothing matches, None is returned.
    """
    reply = response["choices"][0]["message"]["content"].strip().lower()

    subjective_markers = ("غير", "subj", "not", "ليس")
    if any(marker in reply for marker in subjective_markers):
        return "SUBJ"

    objective_markers = ("موضوعية", "obj")
    if any(marker in reply for marker in objective_markers):
        return "OBJ"

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from llmebench.datasets import CT23SubjectivityDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import SubjectivityTask


def metadata():
    """Return descriptive metadata for this asset.

    Keys: "author", "affiliation", "model" and "description" (the latter
    links to the related publication and its preprint).
    """
    fields = (
        (
            "author",
            "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
        ),
        (
            "affiliation",
            "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
        ),
        ("model", "GPT-4o-2024-05-22"),
        (
            "description",
            "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
        ),
    )
    return dict(fields)


def config():
    """Return the run configuration for this few-shot asset.

    Specifies the dataset, task and model classes, the model arguments
    (class labels and ``max_tries``), and the data splits: "ar/dev" for
    testing and "ar" as the few-shot training split.
    """
    settings = {
        "dataset": CT23SubjectivityDataset,
        "task": SubjectivityTask,
        "model": OpenAIModel,
    }
    settings["model_args"] = {
        "class_labels": ["SUBJ", "OBJ"],
        # presumably the retry budget for model calls -- confirm against OpenAIModel
        "max_tries": 30,
    }
    settings["general_args"] = {
        "test_split": "ar/dev",
        "fewshot": {"train_split": "ar"},
    }
    return settings


def prompt(input_sample, examples):
    """Build the chat messages (system + user) for one few-shot request.

    Args:
        input_sample: The tweet text to classify.
        examples: Few-shot examples forwarded to ``few_shot_prompt``.

    Returns:
        A two-element list of ``{"role", "content"}`` message dicts.
    """
    instruction = "Classify the 'tweet' as subjective or objective. Provide only label.\n"
    system_message = {
        "role": "system",
        "content": "You are an expert in tweet classification and analysis.",
    }
    user_message = {
        "role": "user",
        "content": few_shot_prompt(input_sample, instruction, examples),
    }
    return [system_message, user_message]


def few_shot_prompt(input_sample, base_prompt, examples):
    """Assemble the user prompt: instruction, labelled examples, then the query.

    Args:
        input_sample: The tweet to be classified (a string); placed last with
            an empty label.
        base_prompt: Instruction text placed at the top of the prompt.
        examples: Iterable of dicts with "input" and "label" keys. A label of
            "OBJ" is rendered as "objective"; anything else as "subjective".

    Returns:
        The full prompt as a single string.
    """
    segments = [base_prompt, "\n", "Here are some examples:\n\n"]

    # Examples are numbered from 0, matching the order they are given in.
    for position, shot in enumerate(examples):
        if shot["label"] == "OBJ":
            shot_label = "objective"
        else:
            shot_label = "subjective"
        segments.extend(
            [
                "Example ",
                str(position),
                ":\ntweet: ",
                shot["input"],
                "\nlabel: ",
                shot_label,
                "\n\n",
            ]
        )

    segments.extend(["tweet: ", input_sample, "\nlabel: \n"])
    return "".join(segments)


def post_process(response):
    """Map the model's reply text to "SUBJ", "OBJ", or None.

    The reply is read from ``response["choices"][0]["message"]["content"]``,
    stripped and lower-cased, then matched against English and Arabic markers.

    Explicit subjective / negation markers are tested BEFORE the objective
    markers: a negated answer such as "غير موضوعية" ("not objective") or
    "not objective" contains the objective substring and would otherwise be
    misread as "OBJ".

    Args:
        response: Chat-completion style response dict.

    Returns:
        "SUBJ", "OBJ", or None when no marker is found.
    """
    label = response["choices"][0]["message"]["content"].strip().lower()

    # Markers that signal "subjective" directly or negate "objective".
    explicit_subjective = ("subj", "ذاتي", "غير", "ليس", "not")
    if any(marker in label for marker in explicit_subjective):
        return "SUBJ"

    if "obj" in label or "موضوعي" in label:
        return "OBJ"

    # The bare two-letter "لا" matches inside many unrelated words, so it is
    # only used as a last-resort fallback after the objective check.
    if "لا" in label:
        return "SUBJ"

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from llmebench.datasets import CT23SubjectivityDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import SubjectivityTask


def metadata():
    """Return descriptive metadata for this asset.

    The mapping carries the author list, their affiliation, the name of the
    model the asset was written for, and a description pointing to the
    related publication.
    """
    info = dict(
        author="Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
        affiliation="Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
        model="GPT-4o-2024-05-22",
        description="For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
    )
    return info


def config():
    """Return the run configuration for this few-shot asset.

    Names the dataset, task and model classes, the model arguments, and the
    data splits ("ar/dev" for testing, "ar" as the few-shot training split).
    """
    class_labels = ["SUBJ", "OBJ"]
    splits = {"test_split": "ar/dev", "fewshot": {"train_split": "ar"}}
    return {
        "dataset": CT23SubjectivityDataset,
        "task": SubjectivityTask,
        "model": OpenAIModel,
        "model_args": {
            "class_labels": class_labels,
            # presumably the retry budget for model calls -- confirm against OpenAIModel
            "max_tries": 30,
        },
        "general_args": splits,
    }


def prompt(input_sample, examples):
    """Build the chat messages (system + user) for one few-shot request.

    Args:
        input_sample: The tweet text to classify.
        examples: Few-shot examples forwarded to ``few_shot_prompt``.

    Returns:
        A two-element list of ``{"role", "content"}`` message dicts.
    """
    # Arabic instruction that asks for the English labels "subjective" / "objective".
    instruction = 'صنف التغريدة الى "subjective" أو "objective". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.\n'
    user_content = few_shot_prompt(input_sample, instruction, examples)
    messages = []
    messages.append(
        {
            "role": "system",
            "content": "أنت خبير في تحليل و تصنيف التغريدات.",
        }
    )
    messages.append({"role": "user", "content": user_content})
    return messages


def few_shot_prompt(input_sample, base_prompt, examples):
    """Assemble the user prompt: instruction, labelled examples, then the query.

    Args:
        input_sample: The tweet to be classified; placed last with an empty label.
        base_prompt: Instruction text placed at the top of the prompt.
        examples: Iterable of dicts with "input" and "label" keys. A label of
            "OBJ" is rendered as "objective"; anything else as "subjective".

    Returns:
        The full prompt as a single string.
    """
    header = base_prompt + "\n" + "هنا بعض الأمثلة:\n\n"

    # Examples are numbered from 0, matching the order they are given in.
    rendered_shots = []
    for position, shot in enumerate(examples):
        if shot["label"] == "OBJ":
            shot_label = "objective"
        else:
            shot_label = "subjective"
        rendered_shots.append(
            f"مثال {position}:\nالتغريدة: {shot['input']}\nالتصنيف: {shot_label}\n\n"
        )

    query = f"التغريدة: {input_sample}\nالتصنيف: \n"
    return header + "".join(rendered_shots) + query


def post_process(response):
    """Map the model's reply text to "SUBJ", "OBJ", or None.

    The reply is read from ``response["choices"][0]["message"]["content"]``,
    stripped and lower-cased, then matched against English and Arabic markers.

    Explicit subjective / negation markers are tested BEFORE the objective
    markers: a negated answer such as "غير موضوعية" ("not objective") or
    "not objective" contains the objective substring and would otherwise be
    misread as "OBJ".

    Args:
        response: Chat-completion style response dict.

    Returns:
        "SUBJ", "OBJ", or None when no marker is found.
    """
    label = response["choices"][0]["message"]["content"].strip().lower()

    # Markers that signal "subjective" directly or negate "objective".
    explicit_subjective = ("subj", "ذاتي", "غير", "ليس", "not")
    if any(marker in label for marker in explicit_subjective):
        return "SUBJ"

    if "obj" in label or "موضوعي" in label:
        return "OBJ"

    # The bare two-letter "لا" matches inside many unrelated words, so it is
    # only used as a last-resort fallback after the objective check.
    if "لا" in label:
        return "SUBJ"

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from llmebench.datasets import CT23SubjectivityDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import SubjectivityTask


def metadata():
    """Return descriptive metadata for this asset.

    Keys: "author", "affiliation", "model" and "description" (the latter
    links to the related publication and its preprint).
    """
    authors = "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam"
    affiliation = "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)"
    model_name = "GPT-4o-2024-05-22"
    description = "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054)."

    info = {}
    info["author"] = authors
    info["affiliation"] = affiliation
    info["model"] = model_name
    info["description"] = description
    return info


def config():
    """Return the run configuration for this zero-shot asset.

    Names the dataset, task and model classes and selects "ar/dev" as the
    test split. No model arguments are set here.
    """
    settings = {}
    settings["dataset"] = CT23SubjectivityDataset
    settings["task"] = SubjectivityTask
    settings["model"] = OpenAIModel
    settings["general_args"] = {"test_split": "ar/dev"}
    return settings


def prompt(input_sample):
    """Build the chat messages (system + user) for one zero-shot request.

    Args:
        input_sample: The tweet text to classify.

    Returns:
        A two-element list of ``{"role", "content"}`` message dicts; the user
        message holds the Arabic instruction, the tweet, and an empty label slot.
    """
    instruction = 'صنف التغريدة الى "موضوعية" أو "غير موضوعية". قم بتقديم التصنيف فقط دون أي تبرير إضافي.\n'
    tweet_line = f"التغريدة: {input_sample}\n"
    label_line = "التصنيف: \n"
    user_text = instruction + tweet_line + label_line

    system_message = {
        "role": "system",
        "content": "أنت خبير في تحليل و تصنيف التغريدات.",
    }
    user_message = {"role": "user", "content": user_text}
    return [system_message, user_message]


def post_process(response):
    """Map the model's reply text to "SUBJ", "OBJ", or None.

    The reply is read from ``response["choices"][0]["message"]["content"]``,
    stripped and lower-cased. Subjective / negation markers are tested before
    the objective markers; if nothing matches, None is returned.
    """
    reply = response["choices"][0]["message"]["content"].strip().lower()

    for marker in ("غير", "subj", "not", "ليس"):
        if marker in reply:
            return "SUBJ"

    for marker in ("موضوعية", "obj"):
        if marker in reply:
            return "OBJ"

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from llmebench.datasets import CT23SubjectivityDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import SubjectivityTask


def metadata():
    """Return descriptive metadata for this asset.

    Keys: "author", "affiliation", "model" and "description" (the latter
    links to the related publication and its preprint).
    """
    fields = (
        (
            "author",
            "Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
        ),
        (
            "affiliation",
            "Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
        ),
        ("model", "GPT-4o-2024-05-22"),
        (
            "description",
            "For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
        ),
    )
    return dict(fields)


def config():
    """Return the run configuration for this zero-shot asset.

    Names the dataset, task and model classes, the model arguments (class
    labels and ``max_tries``), and selects "ar/dev" as the test split.
    """
    model_arguments = {
        "class_labels": ["SUBJ", "OBJ"],
        # presumably the retry budget for model calls -- confirm against OpenAIModel
        "max_tries": 30,
    }
    general_arguments = {"test_split": "ar/dev"}
    return {
        "dataset": CT23SubjectivityDataset,
        "task": SubjectivityTask,
        "model": OpenAIModel,
        "model_args": model_arguments,
        "general_args": general_arguments,
    }


def prompt(input_sample):
    """Build the chat messages (system + user) for one zero-shot request.

    Args:
        input_sample: The tweet text to classify.

    Returns:
        A two-element list of ``{"role", "content"}`` message dicts; the user
        message holds the instruction, the tweet, and an empty label slot.
    """
    instruction = 'Classify the tweet as "objective" or "subjective". Provide only the label.\n\n'
    tweet_line = f"tweet: {input_sample}\n"
    label_line = "label: \n"
    user_text = instruction + tweet_line + label_line

    system_message = {
        "role": "system",
        "content": "You are an expert in tweet classification and analysis.",
    }
    user_message = {"role": "user", "content": user_text}
    return [system_message, user_message]


def post_process(response):
    """Map the model's reply text to "SUBJ", "OBJ", or None.

    The reply is read from ``response["choices"][0]["message"]["content"]``,
    stripped and lower-cased, then matched against English and Arabic markers.

    Explicit subjective / negation markers are tested BEFORE the objective
    markers: a negated answer such as "غير موضوعية" ("not objective") or
    "not objective" contains the objective substring and would otherwise be
    misread as "OBJ".

    Args:
        response: Chat-completion style response dict.

    Returns:
        "SUBJ", "OBJ", or None when no marker is found.
    """
    label = response["choices"][0]["message"]["content"].strip().lower()

    # Markers that signal "subjective" directly or negate "objective".
    explicit_subjective = ("subj", "ذاتي", "غير", "ليس", "not")
    if any(marker in label for marker in explicit_subjective):
        return "SUBJ"

    if "obj" in label or "موضوعي" in label:
        return "OBJ"

    # The bare two-letter "لا" matches inside many unrelated words, so it is
    # only used as a last-resort fallback after the objective check.
    if "لا" in label:
        return "SUBJ"

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from llmebench.datasets import CT23SubjectivityDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import SubjectivityTask


def metadata():
    """Return descriptive metadata for this asset.

    The mapping carries the author list, their affiliation, the name of the
    model the asset was written for, and a description pointing to the
    related publication.
    """
    info = dict(
        author="Mohamed Bayan Kmainasi, Rakif Khan, Ali Ezzat Shahroor, Boushra Bendou, Maram Hasanain, and Firoj Alam",
        affiliation="Arabic Language Technologies, Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University (HBKU)",
        model="GPT-4o-2024-05-22",
        description="For a comprehensive analysis and results, refer to our peer-reviewed publication available at [Springer](https://doi.org/10.1007/978-981-96-0576-7_30) or explore the preprint version on [arXiv](https://arxiv.org/abs/2409.07054).",
    )
    return info


def config():
    """Return the run configuration for this zero-shot asset.

    Names the dataset, task and model classes and selects "ar/dev" as the
    test split. No model arguments are set here.
    """
    general_arguments = {"test_split": "ar/dev"}
    settings = {
        "dataset": CT23SubjectivityDataset,
        "task": SubjectivityTask,
        "model": OpenAIModel,
    }
    settings["general_args"] = general_arguments
    return settings


def prompt(input_sample):
    """Build the chat messages (system + user) for one zero-shot request.

    Args:
        input_sample: The tweet text to classify.

    Returns:
        A two-element list of ``{"role", "content"}`` message dicts; the user
        message holds the Arabic instruction (asking for the English labels
        "objective" / "subjective"), the tweet, and an empty label slot.
    """
    parts = [
        'صنف التغريدة الى "objective" أو "subjective". قم بتقديم التصنيف دون أي تبرير إضافي.\n',
        f"التغريدة: {input_sample}\n",
        "التصنيف: \n",
    ]
    user_text = "".join(parts)

    messages = []
    messages.append(
        {
            "role": "system",
            "content": "أنت خبير في تحليل و تصنيف التغريدات.",
        }
    )
    messages.append({"role": "user", "content": user_text})
    return messages


def post_process(response):
    """Map the model's reply text to "SUBJ", "OBJ", or None.

    The reply is read from ``response["choices"][0]["message"]["content"]``,
    stripped and lower-cased, then matched against English and Arabic markers.

    Explicit subjective / negation markers are tested BEFORE the objective
    markers: a negated answer such as "غير موضوعية" ("not objective") or
    "not objective" contains the objective substring and would otherwise be
    misread as "OBJ".

    Args:
        response: Chat-completion style response dict.

    Returns:
        "SUBJ", "OBJ", or None when no marker is found.
    """
    label = response["choices"][0]["message"]["content"].strip().lower()

    # Markers that signal "subjective" directly or negate "objective".
    explicit_subjective = ("subj", "ذاتي", "غير", "ليس", "not")
    if any(marker in label for marker in explicit_subjective):
        return "SUBJ"

    if "obj" in label or "موضوعي" in label:
        return "OBJ"

    # The bare two-letter "لا" matches inside many unrelated words, so it is
    # only used as a last-resort fallback after the objective check.
    if "لا" in label:
        return "SUBJ"

    return None
Loading

0 comments on commit 800926c

Please sign in to comment.