```bash
conda create -n cmdf python=3.9 numpy matplotlib
conda activate cmdf
pip install torch==2.0.1+cu117 --index-url https://download.pytorch.org/whl/cu117
pip install transformers==4.33.3
pip install tabulate
pip install seqeval
pip install gradio
```
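A quick sanity check that the CUDA build of torch was picked up (assumes a CUDA 11.7-capable driver):

```python
import torch

# expected: "2.0.1+cu117 True" on a machine with a working CUDA 11.7 setup
print(torch.__version__, torch.cuda.is_available())
```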
- Paper: https://www.overleaf.com
- Guideline: Link
- Dataset: Link
- Colab: Link
- Checkpoints: Link
| Pre-trained model |
|---|
| WangchanBART-BASE |
| WangchanBART-LARGE |
| WangchanBART-Large-Finance |
| XLM-RoBERTa-base |
| XLM-RoBERTa-large |
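The XLM-RoBERTa baselines load directly from the Hugging Face Hub under these names; the WangchanBART Hub IDs are not listed here. A minimal loading sketch with the pinned transformers version (`num_labels` is an assumption that depends on the tag set):

```python
from transformers import AutoModelForTokenClassification, AutoTokenizer

name = "xlm-roberta-base"  # or "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(name)
# num_labels must match the mention tag set used for fine-tuning
model = AutoModelForTokenClassification.from_pretrained(name, num_labels=3)
```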
| Language | Documents | Mentions | Tokens |
|---|---|---|---|
| ALL | 48,376 | 252,904 | 3,871,094 |
| TH | 3,280 | 120,274 | 1,364,839 |
| EN | 11,428 | 132,630 | 2,506,255 |
| Source | Language | Documents | Mentions | Tokens |
|---|---|---|---|---|
| Prachachat-finance | TH | 740 | 32,390 | 401,737 |
| PostToday-finance | TH | 1,000 | 34,408 | 477,372 |
| Kaohoon | TH | 1,140 | 40,301 | 370,280 |
| Kasikorn | TH | 400 | 13,175 | 146,907 |
| Reddit-investing | EN | 5,567 | 71,589 | 1,410,788 |
| Reddit-robinhood | EN | 322 | 2,179 | 42,976 |
| Reddit-stocks | EN | 5,539 | 58,862 | 1,052,491 |
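The train/dev/test splits consumed by `main.py` below are CoNLL-style files. A minimal sketch of the usual one-token-per-line layout with BIO tags; the tag names and sentences here are illustrative, not taken from the dataset:

```
หุ้น	O
KBANK	B-MENTION
ปรับตัวขึ้น	O

GameStop	B-MENTION
shares	O
surged	O
```

Blank lines separate sentences.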
| Items | Score |
|---|---|
| Full | 0.78 |
| Thai | 0.81 |
| English | 0.77 |
| Model | Precision (%) | Recall (%) | F1 (%) |
|---|---|---|---|
| **Full dataset** | | | |
| WangchanBART-BASE | 80.40 | 87.02 | 83.58 |
| WangchanBART-LARGE | 81.74 | 88.20 | 84.84 |
| WangchanBART-Large-Finance | 78.97 | 85.82 | 82.25 |
| XLM-RoBERTa-base | 82.44 | 86.72 | 84.53 |
| XLM-RoBERTa-large | 84.00 | 87.70 | 85.81 |
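The precision/recall/F1 figures are span-level metrics of the kind seqeval (installed above) computes from BIO sequences, e.g.:

```python
from seqeval.metrics import classification_report, f1_score

# illustrative gold and predicted tag sequences, one inner list per sentence
y_true = [["B-MENTION", "I-MENTION", "O", "O"]]
y_pred = [["B-MENTION", "I-MENTION", "O", "B-MENTION"]]

print(f1_score(y_true, y_pred))  # ~0.67: precision 0.5, recall 1.0
print(classification_report(y_true, y_pred))
```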
| Hyperparameter | Value |
|---|---|
| Learning rate | 1e-4, 1e-5, 1e-6, 1e-7 (swept) |
| Dropout | 0.5 |
| Seed | 42 |
| Batch size | 8 |
```bash
python main.py \
    --lr 1e-5 \
    --epochs 100 \
    --mode train \
    --batch_size 8 \
    --name baseline \
    --early_stopping 8 \
    --checkpoint_dir ../checkpoints \
    --pretrained xlm-roberta-base \
    --data_path_train .../train.conll \
    --data_path_dev .../dev.conll \
    --data_path_test .../test.conll \
    --resume None
```
Outputs will be saved under `{checkpoint_dir}/{model_name}/`.
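To run the learning-rate sweep listed in the hyperparameter table, the same command can be looped; a hypothetical driver (not part of the repo) that mirrors the flags above, keeping the elided data paths as placeholders:

```python
import subprocess

for lr in ["1e-4", "1e-5", "1e-6", "1e-7"]:
    subprocess.run(
        [
            "python", "main.py",
            "--lr", lr,
            "--epochs", "100",
            "--mode", "train",
            "--batch_size", "8",
            "--name", f"baseline-lr{lr}",
            "--early_stopping", "8",
            "--checkpoint_dir", "../checkpoints",
            "--pretrained", "xlm-roberta-base",
            # replace the elided paths with the real splits before running
            "--data_path_train", ".../train.conll",
            "--data_path_dev", ".../dev.conll",
            "--data_path_test", ".../test.conll",
        ],
        check=True,
    )
```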
```bash
python main.py \
    --mode test \
    --pretrained xlm-roberta-large \
    --resume ...checkpoint/xlm-roberta-large/dir_name1 \
    --data_path_test data/toy.conll
```
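`gradio` is in the requirements, presumably for an interactive demo. A minimal sketch of serving a fine-tuned model with the transformers pipeline; the checkpoint path is hypothetical and assumes the model was exported with `save_pretrained`:

```python
import gradio as gr
from transformers import pipeline

# hypothetical checkpoint directory; point at a save_pretrained export
ner = pipeline(
    "token-classification",
    model="checkpoints/xlm-roberta-large/dir_name1",
    aggregation_strategy="simple",
)

def tag(text):
    # gr.HighlightedText accepts {"text": ..., "entities": [...]} dicts
    return {
        "text": text,
        "entities": [
            {"entity": e["entity_group"], "start": e["start"], "end": e["end"]}
            for e in ner(text)
        ],
    }

gr.Interface(fn=tag, inputs="text", outputs=gr.HighlightedText()).launch()
```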