test_pem.py

import pandas as pd

test_data = {
    "text": [
        "thanks so much for this awesome politeness predictor @tslmy",
        "RT @tslmy: fuck you. This thing is completely garbage.",
    ]
}

want_emolex_cnts_df = {
    "Disgust": {0: 0, 1: 1},
}

want_politelex_cnts_df = {
    "you_direct": {0: 0, 1: 1},
    "gratitude": {0: 1, 1: 0},
    "taboo": {0: 0, 1: 1},
    "praise": {0: 1, 1: 0},
}

from pem import Pem


def test_pem():
    # Set up a fixture:
    pem = Pem(
        liwc_path="",
        estimator_path="english_twitter_politeness_estimator_noLiwc.joblib",
        feature_defn_path="english_twitter_additional_features.pickle",
    )
    pem.df = pd.DataFrame(test_data)
    pem.tokenize().vectorize()

    # Assertions:
    criteria = pem.emolex_cnts_df.sum() > 0
    got = pem.emolex_cnts_df.loc[:, criteria].to_dict()
    assert got == want_emolex_cnts_df

    criteria = pem.politelex_cnts_df.sum() > 0
    got = pem.politelex_cnts_df.loc[:, criteria].to_dict()
    assert got == want_politelex_cnts_df

    labels = pem.predict()
    assert labels[0] == "Neutral"
    assert labels[1] == "Neutral"