Initial commit

octanove · Apr 6, 2021 · b5918e0 · b5918e0
commit b5918e0
Show file tree

Hide file tree

Showing 77 changed files with 5,830 additions and 0 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,6 @@
+[flake8]
+# Allow lines of up to 150 characters.
+max-line-length = 150
+# mccabe: upper bound on the complexity score flake8 accepts.
+max-complexity = 10
+# No error codes are ignored; the E203 entry below is commented out.
+ignore =
+    # E203  # whitespace before :
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -0,0 +1,36 @@
+# CI workflow: installs the project with Poetry and runs the Makefile's
+# lint, test and typecheck targets on pushes and pull requests to master.
+name: Python package
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the versions as strings; unquoted values are
+        # parsed as floats (e.g. 3.10 would silently become 3.1).
+        python-version: ['3.7', '3.8']
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v1
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install poetry
+        make install
+    - name: Lint
+      run: |
+        make lint
+    - name: Test
+      run: |
+        make test
+    - name: Type checking
+      run: |
+        make typecheck
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,132 @@
+data/kenlm/*
+!data/kenlm/.gitkeep
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,68 @@
+
+# Directory make is invoked in, and the prefix that runs a command inside the
+# Poetry environment.
+BASE_DIR := $(shell pwd)
+POETRY_RUN := poetry run
+
+# Default artifact directory for `train`, suffixed with today's date (YYYYMMDD).
+TRAIN_ARTIFACTS_DIR := ${BASE_DIR}/log/train_$(shell date +'%Y%m%d')
+# Default config file passed to each expats sub-command.
+TRAIN_CONFIG_PATH := ${BASE_DIR}/config/train.yaml
+EVALUATION_CONFIG_PATH := ${BASE_DIR}/config/evaluation.yaml
+PREDICT_CONFIG_PATH := ${BASE_DIR}/config/predict.yaml
+INTREPRET_CONFIG_PATH := ${BASE_DIR}/config/interpret.yaml
+
+# Config overrides forwarded to `expats ... --overrides` (default: none).
+OVERRIDES :=
+
+# Knobs referenced by the predict and train-then-evaluate recipes. They have
+# no default; `?=` declares them while keeping any value that comes from the
+# environment or the command line.
+PREDICT_OUTPUT_PATH ?=
+TRAIN_OVERRIDES ?=
+EVALUATION_OVERRIDES ?=
+
+# Placeholder value: pass TENSORBORD_LOG_DIR=<dir> when running `make tensorboard`.
+TENSORBORD_LOG_DIR := ...
+
+# Run `poetry install` for the project.
+# Declared phony so a file named `install` cannot make the target look up to date.
+.PHONY: install
+install:
+	poetry install
+
+# Start jupyter-notebook through `poetry run`.
+# Declared phony so a file named `notebook` cannot shadow the target.
+.PHONY: notebook
+notebook:
+	${POETRY_RUN} jupyter-notebook
+
+# Create TRAIN_ARTIFACTS_DIR if needed, then run `expats train` with
+# TRAIN_CONFIG_PATH, writing into that directory. OVERRIDES is forwarded
+# unquoted so several space-separated overrides can be passed.
+# Declared phony so a file named `train` cannot shadow the target.
+.PHONY: train
+train:
+	@echo training
+	mkdir -p ${TRAIN_ARTIFACTS_DIR}
+	${POETRY_RUN} expats train ${TRAIN_CONFIG_PATH} ${TRAIN_ARTIFACTS_DIR} --overrides ${OVERRIDES}
+
+# Run `train` with IS_DEBUG=true in the environment, the debug config and
+# log/debug as the artifact directory.
+# $(MAKE) is used rather than a literal `make` so the sub-make inherits
+# flags such as -n and -j and uses the same make binary.
+.PHONY: train-debug
+train-debug:
+	IS_DEBUG=true $(MAKE) train TRAIN_CONFIG_PATH=config/train_debug.yaml TRAIN_ARTIFACTS_DIR=log/debug
+
+# Run `expats evaluate` with EVALUATION_CONFIG_PATH; OVERRIDES is forwarded
+# unquoted so several space-separated overrides can be passed.
+# Declared phony so a file named `evaluate` cannot shadow the target.
+.PHONY: evaluate
+evaluate:
+	@echo evaluation on pre-trained model
+	${POETRY_RUN} expats evaluate ${EVALUATION_CONFIG_PATH} --overrides ${OVERRIDES}
+
+# Run `evaluate` with IS_DEBUG=true in the environment and the debug config.
+# $(MAKE) is used rather than a literal `make` so the sub-make inherits
+# flags such as -n and -j and uses the same make binary.
+.PHONY: evaluate-debug
+evaluate-debug:
+	IS_DEBUG=true $(MAKE) evaluate EVALUATION_CONFIG_PATH=config/evaluate_debug.yaml
+
+# Run `expats predict` with PREDICT_CONFIG_PATH and PREDICT_OUTPUT_PATH.
+# PREDICT_OUTPUT_PATH has no default in this Makefile and must be supplied by
+# the caller (e.g. `make predict PREDICT_OUTPUT_PATH=...`).
+# Declared phony so a file named `predict` cannot shadow the target.
+.PHONY: predict
+predict:
+	@echo prediction with pre-trained model
+	${POETRY_RUN} expats predict ${PREDICT_CONFIG_PATH} ${PREDICT_OUTPUT_PATH} --overrides ${OVERRIDES}
+
+# Run `predict` with IS_DEBUG=true in the environment, the debug config and
+# log/debug_predict as the output path.
+# $(MAKE) is used rather than a literal `make` so the sub-make inherits
+# flags such as -n and -j and uses the same make binary.
+.PHONY: predict-debug
+predict-debug:
+	IS_DEBUG=true $(MAKE) predict PREDICT_CONFIG_PATH=config/predict_debug.yaml PREDICT_OUTPUT_PATH=log/debug_predict
+
+# Run `expats interpret` with INTREPRET_CONFIG_PATH; OVERRIDES is forwarded
+# unquoted so several space-separated overrides can be passed.
+# Declared phony so a file named `interpret` cannot shadow the target.
+.PHONY: interpret
+interpret:
+	@echo interpreting pre-trained model
+	${POETRY_RUN} expats interpret ${INTREPRET_CONFIG_PATH} --overrides ${OVERRIDES}
+
+# Run `interpret` with IS_DEBUG=true in the environment and the debug config.
+# $(MAKE) is used rather than a literal `make` so the sub-make inherits
+# flags such as -n and -j and uses the same make binary.
+.PHONY: interpret-debug
+interpret-debug:
+	IS_DEBUG=true $(MAKE) interpret INTREPRET_CONFIG_PATH=config/interpret_debug.yaml
+
+# Train into a fresh timestamped directory under ${BASE_DIR}/log, evaluate
+# with artifact_path pointing at it, then delete the directory. The final
+# `rm -rf` is only reached when both sub-makes succeed.
+# TRAIN_OVERRIDES and EVALUATION_OVERRIDES are forwarded as the OVERRIDES of
+# the train and evaluate sub-makes respectively.
+# $(eval ...) sets ARTIFACT_PATH when the recipe is expanded, so every line
+# below sees the same timestamp.
+# $(MAKE) is used rather than a literal `make` so the sub-makes inherit
+# flags such as -n and -j and use the same make binary.
+.PHONY: train-then-evaluate
+train-then-evaluate:
+	$(eval ARTIFACT_PATH := ${BASE_DIR}/log/$(shell date +'%Y%m%d%H%M%S'))
+	$(MAKE) train TRAIN_CONFIG_PATH=${TRAIN_CONFIG_PATH} TRAIN_ARTIFACTS_DIR=${ARTIFACT_PATH} OVERRIDES='${TRAIN_OVERRIDES}'
+	$(MAKE) evaluate EVALUATION_CONFIG_PATH=${EVALUATION_CONFIG_PATH} OVERRIDES='artifact_path=${ARTIFACT_PATH} ${EVALUATION_OVERRIDES}'
+	rm -rf ${ARTIFACT_PATH}
+
+# Start tensorboard through `poetry run`, reading logs from TENSORBORD_LOG_DIR.
+# Declared phony so a file named `tensorboard` cannot shadow the target.
+.PHONY: tensorboard
+tensorboard:
+	${POETRY_RUN} tensorboard --logdir ${TENSORBORD_LOG_DIR}
+
+# CI
+# Run flake8 over ./expats and ./tests, printing the offending source lines
+# and per-error statistics.
+# Declared phony so a file named `lint` cannot shadow the target.
+.PHONY: lint
+lint:
+	${POETRY_RUN} flake8 --show-source --statistics ./expats ./tests
+
+# Run pytest on ./tests, collecting coverage for ./expats.
+# Declared phony so a file or directory named `test` cannot make the target
+# look up to date and silently skip the test run.
+.PHONY: test
+test:
+	${POETRY_RUN} pytest -rf --cov=./expats ./tests
+
+# Placeholder: type checking is not wired up yet, so this only prints a notice.
+# Declared phony so a file named `typecheck` cannot shadow the target.
+.PHONY: typecheck
+typecheck:
+	@echo currently not support to check types
diff --git a/README.md b/README.md
@@ -0,0 +1,86 @@
+# EXPATS: A Toolkit for Explainable Automated Text Scoring
+
+![EXPATS: A Toolkit for Explainable Automated Text Scoring](overview.png)
+
+EXPATS is an open-source framework for automated text scoring (ATS) tasks, such as automated essay scoring and readability assessment. Users can develop and experiment with different ATS models quickly by using the toolkit's easy-to-use components, the configuration system, and the command-line interface. The toolkit also provides seamless integration with [the Language Interpretability Tool (LIT)](https://pair-code.github.io/lit/) so that one can interpret and visualize models and their predictions. 
+
+## Requirements
+
+- [poetry](https://python-poetry.org/)
+
+## Usage
+
+1. Clone this repository.
+
+```bash
+$ git clone git@github.com:octanove/expats.git
+$ cd expats
+```
+
+2. Install Python dependencies via poetry, and launch an interactive shell
+
+```bash
+$ poetry install
+$ poetry shell
+```
+
+3. Prepare the dataset for your task
+
+We'll use ASAP-AES, a standard dataset for automated essay scoring. You can download the dataset from [the Kaggle page](https://www.kaggle.com/c/asap-aes). EXPATS supports a dataset reader for ASAP-AES by default.
+
+4. Write a config file
+
+In the config file, you specify the type of the task (`task`), the type of the profiler (`profiler`) and its hyperparameters, and the dataset to use (`dataset`). An example config file for training a BERT-based regressor for ASAP-AES is shown below.
+
+```bash
+$ cat config/asap_aes/train_bert.yaml
+task: regression
+
+profiler:
+    type: TransformerRegressor
+    params:
+      trainer:
+        gpus: 1
+        max_epochs: 80
+        accumulate_grad_batches: 2
+      network:
+        output_normalized: true
+        pretrained_model_name_or_path: bert-base-uncased
+        lr: 4e-5
+      data_loader:
+        batch_size: 8
+      val_ratio: 0.2
+      max_length: null
+
+dataset:
+    type: asap-aes
+    params:
+        path: data/asap-aes/training_set_rel3.tsv
+```
+
+5. Train your model
+
+You can train the model by running the `expats train` command as shown below. 
+
+```bash
+$ expats train config/asap_aes/train_bert.yaml artifacts
+```
+
+The result (e.g., log file, the model weights) is stored in the directory `artifacts`.
+
+6. Evaluate your model
+
+You can evaluate your model by running:
+
+```bash
+$ expats evaluate config/asap_aes/evaluate.yaml
+```
+
+You can also configure the evaluation settings by modifying the configuration file.
+
+7. Interpret your model
+
+You can launch the LIT server to interpret and visualize the trained model and its behavior:
+```bash
+$ expats interpret config/asap_aes/interpret.yaml
+```
diff --git a/config/asap_aes/evaluate.yaml b/config/asap_aes/evaluate.yaml
@@ -0,0 +1,30 @@
+
+# Evaluation config for ASAP-AES. Replace every <fill> placeholder before use.
+
+# Please set your artifact path specified when training
+artifact_path: <fill>
+
+dataset:
+    type: asap-aes
+    params:
+        # Repo-relative path (resolved from the directory expats is run in),
+        # matching the style used in train_bert.yaml, instead of a
+        # developer-specific absolute path.
+        path: data/asap-aes/training_set_rel3.tsv.random11680_rest
+        prompt_id: <fill>
+
+output_convert:
+    regression_to_classification:
+        type: MinMaxDenormalizedRoundNearestInteger
+        params:
+          x_min: <fill>
+          x_max: <fill>
+
+metrics:
+    classification:
+        - type: MacroF1
+          params:
+        - type: MicroF1
+          params:
+        - type: Accuracy
+          params:
+        - type: QuadraticWeightedKappa
+          params:
+    regression:
+        - type: PearsonCorrelation
+          params:
diff --git a/config/asap_aes/train_bert.yaml b/config/asap_aes/train_bert.yaml
@@ -0,0 +1,22 @@
+# Training config for ASAP-AES using the TransformerRegressor profiler.
+# Type of the task.
+task: regression
+
+# Type of the profiler and its hyperparameters.
+profiler:
+    type: TransformerRegressor
+    params:
+      trainer:
+        gpus: 1
+        max_epochs: 80
+        accumulate_grad_batches: 2
+      network:
+        output_normalized: true
+        pretrained_model_name_or_path: bert-base-uncased
+        lr: 4e-5
+      data_loader:
+        batch_size: 8
+      # NOTE(review): presumably the fraction of the data held out for validation — confirm.
+      val_ratio: 0.2
+      max_length: null
+
+# Dataset to use.
+dataset:
+    type: asap-aes
+    params:
+        path: data/asap-aes/training_set_rel3.tsv.random11680