"""prepare_liwc.py

Convert the LIWC dictionary from its original `.dic` format to "long-form" CSV.

Run as a script, this reads `LIWC2015_Dictionary.dic` from the current
directory and writes `liwc15.csv` with a `term,category` header followed by
one row per (term, category ID) pair found in the dictionary.
"""
import re
import sys

SOURCE_PATH = "LIWC2015_Dictionary.dic"
TARGET_PATH = "liwc15.csv"
BOUNDARY = "%"

# Runs of consecutive tabs are collapsed so that splitting on a single tab
# never yields empty fields.
_TAB_RUN = re.compile(r"\t+")


def parse_labels(label_lines):
    """Build the category-ID -> category-name mapping.

    Each line is expected to look like ``<numeric ID>\\t<label>``.  Only the
    first space-delimited word of the label is kept, title-cased.  Lines that
    have fewer than two tab-separated fields, or whose ID is not numeric, are
    reported on stdout and skipped.

    Args:
        label_lines: stripped, tab-normalised lines of the label section.

    Returns:
        dict mapping ID strings to title-cased category names.
    """
    labels = {}
    for line in label_lines:
        parts = line.split("\t")
        if len(parts) < 2:
            print(f'This line is not long enough: "{line}"')
            continue
        category_id, label = parts[:2]
        if not category_id.isnumeric():
            print(f'What is "{line}"?')
            continue
        # `partition` keeps the whole label when it contains no space;
        # slicing with `label.find(" ")` would cut off the last character
        # in that case because `find` returns -1.
        labels[category_id] = label.partition(" ")[0].title()
    return labels


def liwc_to_rows(lines):
    """Turn the lines of a `.dic` file into ``(term, category)`` pairs.

    The first line is discarded unconditionally (presumably the opening `%`
    delimiter -- it is not inspected).  Every remaining line is stripped and
    its tab runs are collapsed.  The first remaining line equal to `%`
    separates the label section from the term section.

    Args:
        lines: all lines of the dictionary file, as a list of strings.

    Returns:
        list of ``(term, category_name)`` tuples, in file order.

    Raises:
        ValueError: if no `%` line separates labels from terms.
    """
    body = [_TAB_RUN.sub("\t", line.strip()) for line in lines[1:]]
    try:
        # Equality, not identity: `line is "%"` only works by accident of
        # string caching in the interpreter.
        boundary = body.index(BOUNDARY)
    except ValueError:
        raise ValueError(
            f'No "{BOUNDARY}" line separates the labels from the terms.'
        ) from None

    labels = parse_labels(body[:boundary])
    rows = []
    for line in body[boundary + 1 :]:
        term, *category_ids = line.split("\t")
        for category_id in category_ids:
            if category_id not in labels:
                # Same report-and-skip policy as for malformed label lines,
                # instead of aborting with a bare KeyError.
                print(f'Unknown category ID "{category_id}" for term "{term}".')
                continue
            rows.append((term, labels[category_id]))
    return rows


def main(source=SOURCE_PATH, target=TARGET_PATH):
    """Read the dictionary at `source` and write the long-form CSV to `target`.

    If `source` does not exist, a hint is printed and the process exits with
    status 1 so that callers can detect the failure.
    """
    try:
        with open(source, "r") as handle:
            lines = handle.readlines()
    except FileNotFoundError:
        print("Kindly put `LIWC2015_Dictionary.dic` here, please.")
        sys.exit(1)

    rows = liwc_to_rows(lines)
    # Write CSV -- no need for `pandas`.
    # NOTE(review): fields are joined with a plain comma and no quoting;
    # this assumes terms never contain a comma -- confirm against the data.
    with open(target, "w") as handle:
        handle.write("term,category\n")
        handle.writelines(f"{term},{category}\n" for term, category in rows)


if __name__ == "__main__":
    main()