added intro and overview of pipeline + refactoring for ner_sentiment_detection.py to ner_gender_detection.py
mgutierrezc committed Oct 15, 2023
1 parent 4a563e8 commit 72d5ad4
Showing 7 changed files with 64 additions and 20 deletions.
5 changes: 4 additions & 1 deletion 1_create_judges_files.py
@@ -5,7 +5,7 @@
import sqlite3
from tqdm import tqdm
from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *

maxInt = sys.maxsize
parameters_json = "./parameters.json"
@@ -226,6 +226,7 @@ def get_judges_list(dataframe, path: str, file_name) -> list:
if __name__ == "__main__":
parameters = json.load(open(parameters_json))

+# setting up main parameters
local_path = parameters["paths"]["unix_paths"]["out_directory"]
year = parameters["parameters"]["year"]
judges_subfolder = local_path + "/judges_dfs"
@@ -236,6 +237,8 @@ def get_judges_list(dataframe, path: str, file_name) -> list:
conn = sqlite3.connect(local_path + "/judges_database")

# reading stopwords
+male_pronouns = parameters["male_pronouns"]
+female_pronouns = parameters["female_pronouns"]
stopwords = nltk.corpus.stopwords.words('spanish')
stopwords = [word for word in stopwords if word not in male_pronouns and word not in female_pronouns]

2 changes: 1 addition & 1 deletion 2_judges_files_stacker.py
@@ -4,7 +4,7 @@
import tqdm, json
from tqdm import tqdm
from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
from create_judges_files import *

maxInt = sys.maxsize
2 changes: 1 addition & 1 deletion 3_train_embeddings_w2v.py
@@ -3,7 +3,7 @@
import tqdm, json, sys
from tqdm import tqdm
from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
import pingouin as pg
from create_judges_files import *
from sklearn.utils import resample
49 changes: 43 additions & 6 deletions README.md
@@ -1,9 +1,34 @@
# Measuring Biases in Judges Using Textual Data
The idea of this project was to obtain a measure of gender bias using publicly available data from [judicial cases in Peru](https://cej.pj.gob.pe/cej/forms/busquedaform.html) and word embeddings.

To achieve this goal, the pipeline in this repo does the following:

-### Parameters
1. Create a parsed corpus for each judge found in the sample for all the available years
2. Create word embeddings for each judge from this data using Word2Vec
3. Use the word embeddings to obtain two gender slants
    3.1. Gender-Career (gender vs. career/household-oriented chores)
    3.2. Gender-Moral (gender vs. good/bad)

-`parameters.json`
The original idea comes from this [Kenya paper](http://users.nber.org/~dlchen/papers/Kenya_Courts_In_Group_Bias.pdf), which uses GloVe instead of Word2Vec and does not work with textual data in Spanish. As the original scripts did not tackle those challenges and needed improvement in terms of abstraction, softcoding, and documentation, they were recreated from scratch; the final version is the one in this repository.

The statistics behind the methodology used to obtain the slants can be found on page 41 of that paper.
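
Conceptually, each slant compares how close gendered tokens sit to the two poles of a dimension in a judge's embedding space. Below is a minimal sketch of that comparison, assuming a trained gensim Word2Vec model and token lists like the ones in `parameters.json`; the function names and paths are hypothetical, not the repo's actual API.

```python
# Minimal sketch of a gender slant, assuming a trained gensim Word2Vec model.
# Function names, paths, and token choices are hypothetical illustrations.
import numpy as np
from gensim.models import Word2Vec

def mean_similarity(model, group_a: list, group_b: list) -> float:
    """Average cosine similarity over all (a, b) token pairs found in the vocabulary."""
    sims = [model.wv.similarity(a, b)
            for a in group_a for b in group_b
            if a in model.wv and b in model.wv]
    return float(np.mean(sims)) if sims else 0.0

def gender_slant(model, male: list, female: list, pole_x: list, pole_y: list) -> float:
    """Positive when male tokens lean towards pole_x (e.g. career) and female
    tokens towards pole_y (e.g. family) more than the reverse."""
    male_gap = mean_similarity(model, male, pole_x) - mean_similarity(model, male, pole_y)
    female_gap = mean_similarity(model, female, pole_x) - mean_similarity(model, female, pole_y)
    return male_gap - female_gap

model = Word2Vec.load("judges_models/judge_1.w2v")  # hypothetical path
career_slant = gender_slant(model,
                            male=["sr", "dr"], female=["ella"],
                            pole_x=["pago", "trabajador"], pole_y=["familia", "hijos"])
```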

## Pipeline Overview

- `1_create_judges_files.py` cleans the scraped data and creates one dataset per judge for each available year (e.g. judge_1-2017.pkl, judge_1-2018.pkl, ...)
- `2_judges_files_stacker.py` stacks these datasets at the judge level (e.g. judge_1-2017.pkl and judge_1-2018.pkl are stacked to create judge_1.pkl)
- `3_train_embeddings_w2v.py` trains Word2Vec models per judge using the stacked datasets created by `2_judges_files_stacker.py` and the `word_dimension_tokens` from `parameters.json` (explained in the next section); a sketch of these last two steps follows this list
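
As a rough illustration of the last two steps (file names, the column name, and hyperparameters are assumptions, not the repo's actual settings):

```python
# Illustrative sketch of steps 2 and 3: stack the per-year files, then train
# one Word2Vec model per judge. The actual logic lives in
# 2_judges_files_stacker.py and 3_train_embeddings_w2v.py.
import glob
import pandas as pd
from gensim.models import Word2Vec

judge_id = "judge_1"

# step 2: stack judge_1-2017.pkl, judge_1-2018.pkl, ... into judge_1.pkl
yearly_files = sorted(glob.glob(f"judges_dfs/{judge_id}-*.pkl"))
stacked = pd.concat([pd.read_pickle(path) for path in yearly_files], ignore_index=True)
stacked.to_pickle(f"judges_dfs/{judge_id}.pkl")

# step 3: train one model per judge ("cleaned_sentences" is a hypothetical
# column holding tokenized sentences)
sentences = list(stacked["cleaned_sentences"])
model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=5, workers=4)
model.save(f"judges_models/{judge_id}.w2v")  # hypothetical output path
```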

The functions that perform each of these tasks are defined in the respective `.py` files. Subroutines shared across the three pipeline files are stored in `cleaning_tools/corpus_cleaner_funcs.py` or `cleaning_tools/ner_gender_detection.py`, depending on their functionality.

Note: The input comes from a private PostgreSQL database holding the data scraped from the website linked above.

## Deployment

- Create an environment to run the scripts
  - If you are using Anaconda, create it using `environment.yml`
  - Otherwise, create the environment using `runtime.txt` to find the right Python version and `requirements.txt` for the packages
- Create the file `parameters.json` in the folder where you'll run the `.py` or `.sh` scripts
```json
{
"paths": {
@@ -13,8 +38,20 @@
"year": "2018",
"embeddings_file": "/burg/sscc/projects/data_text_iat/judges_data/SBW-vectors-300-min5.txt"
},
"": {
""
},
"male_pronouns": ["él", "él mismo", "suyo", "", "consigo", "ese", "ese mismo", "aquel", "aquel mismo", "este", "este mismo", "esto", "aquello", "aquello mismo", "otro", "otro mismo", "alguno", "alguno mismo", "ninguno", "ninguno mismo", "varios", "varios mismos", "pocos", "pocos mismos", "muchos", "muchos mismos", "unos", "unos mismos", "mío", "tuyo", "nuestro", "vuestro", "cuyo", "cuántos", "cuánto", "cuantos", "cuanto", "todo", "tanto", "poco", "demasiado", "algunos", "todos", "tantos", "demasiados", "otros", "nosotros", "vosotros", "ellos", "el", "los", "míos", "tuyos", "nuestros", "vuestros", "suyos", "el que", "el cual", "los que", "los cuales", "cuyos", "mucho", "otro más", "cualquiera", "ambos", "sendos", "uno"],
"female_pronouns": ["nosotras", "vosotras", "ellas", "la", "las", "ella", "mía", "mías", "tuya", "tuyas", "nuestra", "nuestras", "vuestra", "vuestras", "suya", "suyas", "esta", "esa", "aquella", "la que", "la cual", "cuya", "cuanta", "las que", "las cuales", "cuyas", "cuantas", "cuánta", "cuántas", "alguna", "toda", "tanta", "poca", "demasiada", "otra", "mucha", "ninguna", "algunas", "todas", "tantas", "pocas", "demasiadas", "otras", "muchas", "varias", "otra más", "cualquiera", "ambas", "sendas", "una", "ella misma", "", "consigo", "esa misma", "aquella misma", "esta misma", "esto", "otra misma", "alguna misma", "ninguna misma", "varias mismas", "pocas mismas", "muchas mismas", "unas", "unas mismas"],
"word_dimension_tokens": {
"male_names": ["luis", "juan", "carlos","antonio","miguel"],
"female_names": ["rosa", "maría","pilar","isabel","ana"],
"male": ["sr", "dr", "", "", ""],
"female": ["ella", ""],
"good": ["gran", "solidaria","sana","prudente","trabajadora", "razonabilidad"],
"bad": ["agresor","morosos","mala","perjudicada","victima"],
"career": ["pago", "trabajador","obreros", "empleadores", "obrero"],
"family": ["familia","hijos","hija","padre", "madre"]
}
}
```
- Run the scripts in the order indicated by the number at the beginning of each file name
- After running `1_create_judges_files.py` and `2_judges_files_stacker.py`, update the entries `male_pronouns`, `female_pronouns`, and `word_dimension_tokens` according to your data
- An automated version of this classification was attempted in `archive/2.5_obtain_words_per_category.py` using a BERT model trained on Spanish data. The results were highly inaccurate, so the script was discarded.
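
For reference, the pipeline scripts consume these entries following the pattern below (a condensed sketch mirroring the loading code in `1_create_judges_files.py`; error handling omitted):

```python
# Condensed sketch of how parameters.json is consumed, mirroring the pattern
# in 1_create_judges_files.py; assumes the nltk stopwords corpus is downloaded.
import json
import nltk

parameters = json.load(open("./parameters.json"))

local_path = parameters["paths"]["unix_paths"]["out_directory"]
year = parameters["parameters"]["year"]

# keep gendered pronouns out of the stopword list so gender signals survive cleaning
male_pronouns = parameters["male_pronouns"]
female_pronouns = parameters["female_pronouns"]
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords = [word for word in stopwords
             if word not in male_pronouns and word not in female_pronouns]
```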
2 changes: 1 addition & 1 deletion archive/2.5_obtain_words_per_category.py
@@ -5,7 +5,7 @@
from collections import Counter
from tqdm import tqdm
from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
from create_judges_files import *

maxInt = sys.maxsize
16 changes: 9 additions & 7 deletions cleaning_tools/corpus_cleaner_funcs.py
@@ -1,17 +1,16 @@
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
-import sys, csv, tqdm, spacy, nltk, pickle, re, logging
+import tqdm, spacy, nltk, pickle, re, logging
from sklearn.utils import resample
from multiprocessing import Pool

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',
level=logging.INFO)


-def spanish_cleaner(txt_file):
+def spanish_cleaner(txt_file: str) -> list:
"""
Cleans the input text following Spanish grammatical rules
"""

text = txt_file
text = re.sub(r"(&[a-zA-Z]*;)", " ", text) # the txt files had unwanted HTML entities like "&rsquo;"; this line removes such text
text = text.lower()
@@ -80,6 +79,7 @@ def spanish_sentence_cleaner(df, column_data: str) -> list:
Cleaning involves the removal of punctuation, stop words and
encoding characters
"""

logging.info("--Sentence cleaner--")
text_data = list(df[column_data]) # working with text data column

@@ -107,6 +107,7 @@ def top_words(cleaned_sentences: list, num_common_words=50000) -> list:
Obtains a list with the N most common words in the current text
"""

logging.info("--Top words in text--")

word_list = [] # list with all words
@@ -125,6 +126,7 @@ def creating_train_sample(cleaned_sentences: list, top_n_words: list) -> list:
Creates a bootstrapping sample with sentences that contain at least
one word from the most frequent ones
"""

logging.info("--Creating training sample--")
bootstrap_docs = resample(cleaned_sentences, replace=True, n_samples=len(cleaned_sentences))
output = []
8 changes: 5 additions & 3 deletions cleaning_tools/{ner_sentiment_detection.py → ner_gender_detection.py}
@@ -1,11 +1,12 @@
-import nltk, spacy, scipy
+import nltk, spacy, scipy, json
import regex as re
from nltk import pos_tag
import numpy as np
import gender_guesser.detector as gender
from transformers import AutoTokenizer, BertModel
from collections import Counter

+# parameters
+parameters_json = "./parameters.json"

# functions
def distance_words(word: str, comparison_word: str, model: object) -> float:
"""
@@ -250,6 +251,7 @@ def obtain_final_count(unique_names: list, total_counts: list) -> dict:
stopwords = nltk.corpus.stopwords.words('spanish')
gender_detector = gender.Detector()
nlp = spacy.load("es_core_news_sm")
+parameters = json.load(open(parameters_json))

# processing text
cleaned_text = " ".join(spanish_cleaner(text, stopwords)) # cleaning for stopwords
