From 72d5ad431d15e3437633c49966c9c18619e005d8 Mon Sep 17 00:00:00 2001
From: Marco Gutierrez
Date: Sun, 15 Oct 2023 04:46:34 -0400
Subject: [PATCH] added intro and overview of pipeline + renamed
 ner_sentiment_detection.py to ner_gender_detection.py

---
 1_create_judges_files.py                      |  5 +-
 2_judges_files_stacker.py                     |  2 +-
 3_train_embeddings_w2v.py                     |  2 +-
 README.md                                     | 49 ++++++++++++++++---
 archive/2.5_obtain_words_per_category.py      |  2 +-
 cleaning_tools/corpus_cleaner_funcs.py        | 16 +++---
 ...t_detection.py => ner_gender_detection.py} |  8 +--
 7 files changed, 64 insertions(+), 20 deletions(-)
 rename cleaning_tools/{ner_sentiment_detection.py => ner_gender_detection.py} (98%)

diff --git a/1_create_judges_files.py b/1_create_judges_files.py
index a3ec5c6..ee65443 100644
--- a/1_create_judges_files.py
+++ b/1_create_judges_files.py
@@ -5,7 +5,7 @@ import sqlite3
 from tqdm import tqdm
 
 from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
 
 maxInt = sys.maxsize
 parameters_json = "./parameters.json"
@@ -226,6 +226,7 @@ def get_judges_list(dataframe, path: str, file_name) -> list:
 if __name__ == "__main__":
     parameters = json.load(open(parameters_json))
 
+    # setting up main parameters
     local_path = parameters["paths"]["unix_paths"]["out_directory"]
     year = parameters["parameters"]["year"]
     judges_subfolder = local_path + "/judges_dfs"
@@ -236,6 +237,7 @@ def get_judges_list(dataframe, path: str, file_name) -> list:
     conn = sqlite3.connect(local_path + "/judges_database")
 
     # reading stopwords
+    male_pronouns = parameters["male_pronouns"]
+    female_pronouns = parameters["female_pronouns"]
     stopwords = nltk.corpus.stopwords.words('spanish')
     stopwords = [word for word in stopwords if word not in male_pronouns and word not in female_pronouns]
 
diff --git a/2_judges_files_stacker.py b/2_judges_files_stacker.py
index 6435177..249490f 100644
--- a/2_judges_files_stacker.py
+++ b/2_judges_files_stacker.py
@@ -4,7 +4,7 @@ import tqdm, json
 from tqdm import tqdm
 
 from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
 from create_judges_files import *
 
 maxInt = sys.maxsize
diff --git a/3_train_embeddings_w2v.py b/3_train_embeddings_w2v.py
index 4136ae6..1026b4a 100644
--- a/3_train_embeddings_w2v.py
+++ b/3_train_embeddings_w2v.py
@@ -3,7 +3,7 @@ import tqdm, json, sys
 from tqdm import tqdm
 
 from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
 import pingouin as pg
 from create_judges_files import *
 from sklearn.utils import resample
diff --git a/README.md b/README.md
index 14323da..c5fe51a 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,34 @@
+# Measuring Biases in Judges Using Textual Data
+The goal of this project is to measure gender bias using publicly available data from [judicial cases in Peru](https://cej.pj.gob.pe/cej/forms/busquedaform.html) and word embeddings.
+To achieve this, the pipeline in this repo does the following:
 
-### Parameters
+1. Create a parsed corpus for each judge found in the sample, covering all available years
+2. Create word embeddings for each judge from this data, using Word2Vec
+3. Use the word embeddings to obtain two gender slants (see the sketch after this list):
+   3.1. Gender-Career (gender vs. career/household-oriented chores)
+   3.2. Gender-Moral (gender vs. good/bad)
 
-`parameters.json`
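+
+For illustration, a Gender-Career slant could be computed roughly as below. This is a minimal sketch rather than the paper's exact statistic (page 41 there has the formal definition): it assumes a trained gensim 4.x model's `KeyedVectors` (`wv`) plus word lists like the `word_dimension_tokens` described under Deployment, and the function names are illustrative, not part of the pipeline.
+
+```python
+import numpy as np
+
+def mean_sim(wv, words_a, words_b):
+    # average cosine similarity over all pairs present in the vocabulary
+    sims = [wv.similarity(a, b) for a in words_a for b in words_b
+            if a in wv.key_to_index and b in wv.key_to_index]
+    return float(np.mean(sims)) if sims else 0.0
+
+def gender_career_slant(wv, male, female, career, family):
+    # positive => male terms sit closer to career words, female terms to family words
+    return (mean_sim(wv, male, career) - mean_sim(wv, male, family)) \
+        - (mean_sim(wv, female, career) - mean_sim(wv, female, family))
+```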
+The original idea comes from this [Kenya paper](http://users.nber.org/~dlchen/papers/Kenya_Courts_In_Group_Bias.pdf), which uses GloVe instead of Word2Vec and does not deal with Spanish-language text. Since the original scripts didn't tackle those challenges and needed better abstraction, soft-coding, and documentation, they were rewritten from scratch; this repository contains the final version.
+The statistics behind the methodology used to obtain the slants can be found on page 41 of that paper.
+
+## Pipeline Overview
+
+- `1_create_judges_files.py` cleans the scraped data and creates one dataset per judge for each available year (e.g. `judge_1-2017.pkl`, `judge_1-2018.pkl`, ...)
+- `2_judges_files_stacker.py` stacks these datasets at the judge level (e.g. `judge_1-2017.pkl` and `judge_1-2018.pkl` are stacked to create `judge_1.pkl`)
+- `3_train_embeddings_w2v.py` trains Word2Vec models per judge (see the training sketch below) using the stacked datasets created by `2_judges_files_stacker.py` and the `word_dimension_tokens` from `parameters.json` (explained in the next section)
+
+The functions that perform each of these tasks are defined in the respective `.py` files. However, some subroutines are shared across the three pipeline files; these are stored in `cleaning_tools/corpus_cleaner_funcs.py` or `cleaning_tools/ner_gender_detection.py`, depending on their functionality.
+
+Note: the input comes from a private PostgreSQL database holding the data scraped from the website linked above.
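+
+As a rough illustration of the training step, a per-judge model could be built as follows. This is a minimal sketch assuming gensim 4.x and that each stacked `.pkl` file holds a list of tokenized sentences; the path, file format, and hyperparameters here are illustrative, not the script's actual values.
+
+```python
+import pickle
+from gensim.models import Word2Vec
+
+# hypothetical stacked file produced by 2_judges_files_stacker.py
+with open("judges_dfs/judge_1.pkl", "rb") as f:
+    sentences = pickle.load(f)  # e.g. [["la", "jueza", "declara", ...], ...]
+
+# 300 dimensions, matching the SBW reference embeddings listed in parameters.json
+model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=5, workers=4)
+model.wv.save_word2vec_format("judge_1_vectors.txt")
+```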
"juan", "carlos","antonio","miguel"], + "female_names": ["rosa", "maría","pilar","isabel","ana"], + "male": ["sr", "dr", "", "", ""], + "female": ["ella", ""], + "good": ["gran", "solidaria","sana","prudente","trabajadora", "razonabilidad"], + "bad": ["agresor","morosos","mala","perjudicada","victima"], + "career": ["pago", "trabajador","obreros", "empleadores", "obrero"], + "family": ["familia","hijos","hija","padre", "madre"] +}, } -``` \ No newline at end of file +``` +- Run the scripts according to the order indicated at the beginning of the file + - After running `1_create_judges_files.py` and `2_judges_files_stacker.py`, Update the entries `male_pronouns`, `female_pronouns`, `word_dimension_tokens` according to your data + - An automated version of this classification was attempted in `archive/2.5_obtain_words_per_category.py` using a BERT model trained with spanish data. The results were highly inaccurate so the script was discarded. diff --git a/archive/2.5_obtain_words_per_category.py b/archive/2.5_obtain_words_per_category.py index 70aaae7..0686b1e 100644 --- a/archive/2.5_obtain_words_per_category.py +++ b/archive/2.5_obtain_words_per_category.py @@ -5,7 +5,7 @@ from collections import Counter from tqdm import tqdm from cleaning_tools.corpus_cleaner_funcs import * -from cleaning_tools.ner_sentiment_detection import * +from cleaning_tools.ner_gender_detection import * from create_judges_files import * maxInt = sys.maxsize diff --git a/cleaning_tools/corpus_cleaner_funcs.py b/cleaning_tools/corpus_cleaner_funcs.py index 03c20e4..a577a79 100644 --- a/cleaning_tools/corpus_cleaner_funcs.py +++ b/cleaning_tools/corpus_cleaner_funcs.py @@ -1,17 +1,16 @@ -#!/usr/bin/env python -# coding: utf-8 - import pandas as pd -import sys, csv, tqdm, spacy, nltk, pickle, re, logging +import tqdm, spacy, nltk, pickle, re, logging from sklearn.utils import resample -from multiprocessing import Pool logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO) -def spanish_cleaner(txt_file): - +def spanish_cleaner(txt_file: str) -> list: + """ + Cleans the inputted text following spanish grammatical rules + """ + text = txt_file text = re.sub(r"(&[a-zA-Z]*;)", " ", text) # the txt files had some unwanted text like ’ this line removes such text text = text.lower() @@ -80,6 +79,7 @@ def spanish_sentence_cleaner(df, column_data: str) -> list: Cleaning involves the removal of punctuation, stop words and encoding characters """ + logging.in1("--Sencence cleaner--") text_data = list(df[column_data]) # working with text data column @@ -107,6 +107,7 @@ def top_words(cleaned_sentences: list, num_common_words=50000) -> list: Obtaining a list with the N most common words in current text """ + logging.info("--Top words in text--") word_list = [] # list with all words @@ -125,6 +126,7 @@ def creating_train_sample(cleaned_sentences: list, top_n_words: list) -> list: Creates a bootstraping sample with sentences that contain at least one word from the most frequent ones """ + logging.info("--Creating training sample--") bootstrap_docs = resample(cleaned_sentences, replace=True, n_samples=len(cleaned_sentences)) output = [] diff --git a/cleaning_tools/ner_sentiment_detection.py b/cleaning_tools/ner_gender_detection.py similarity index 98% rename from cleaning_tools/ner_sentiment_detection.py rename to cleaning_tools/ner_gender_detection.py index 4ccde56..3abb53d 100644 --- a/cleaning_tools/ner_sentiment_detection.py +++ b/cleaning_tools/ner_gender_detection.py @@ -1,11 +1,12 @@ -import 
diff --git a/archive/2.5_obtain_words_per_category.py b/archive/2.5_obtain_words_per_category.py
index 70aaae7..0686b1e 100644
--- a/archive/2.5_obtain_words_per_category.py
+++ b/archive/2.5_obtain_words_per_category.py
@@ -5,7 +5,7 @@ from collections import Counter
 from tqdm import tqdm
 
 from cleaning_tools.corpus_cleaner_funcs import *
-from cleaning_tools.ner_sentiment_detection import *
+from cleaning_tools.ner_gender_detection import *
 from create_judges_files import *
 
 maxInt = sys.maxsize
diff --git a/cleaning_tools/corpus_cleaner_funcs.py b/cleaning_tools/corpus_cleaner_funcs.py
index 03c20e4..a577a79 100644
--- a/cleaning_tools/corpus_cleaner_funcs.py
+++ b/cleaning_tools/corpus_cleaner_funcs.py
@@ -1,17 +1,16 @@
-#!/usr/bin/env python
-# coding: utf-8
-
 import pandas as pd
-import sys, csv, tqdm, spacy, nltk, pickle, re, logging
+import tqdm, spacy, nltk, pickle, re, logging
 from sklearn.utils import resample
-from multiprocessing import Pool
 
 logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
 
 
-def spanish_cleaner(txt_file):
-
+def spanish_cleaner(txt_file: str, stopwords: list) -> list:
+    """
+    Cleans the input text following Spanish grammatical rules,
+    dropping the given stopwords
+    """
+
     text = txt_file
     text = re.sub(r"(&[a-zA-Z]*;)", " ", text)  # the txt files had some unwanted text like "’"; this line removes it
     text = text.lower()
@@ -80,6 +79,7 @@ def spanish_sentence_cleaner(df, column_data: str) -> list:
     Cleaning involves the removal of punctuation, stop words and encoding characters
     """
 
+    logging.info("--Sentence cleaner--")
     text_data = list(df[column_data])  # working with the text data column
 
@@ -107,6 +107,7 @@ def top_words(cleaned_sentences: list, num_common_words=50000) -> list:
     Obtaining a list with the N most common words in the current text
     """
 
+    logging.info("--Top words in text--")
     word_list = []  # list with all words
 
@@ -125,6 +126,7 @@ def creating_train_sample(cleaned_sentences: list, top_n_words: list) -> list:
     Creates a bootstrapping sample with sentences that contain at least one word
     from the most frequent ones
     """
 
+    logging.info("--Creating training sample--")
     bootstrap_docs = resample(cleaned_sentences, replace=True, n_samples=len(cleaned_sentences))
     output = []
 
diff --git a/cleaning_tools/ner_sentiment_detection.py b/cleaning_tools/ner_gender_detection.py
similarity index 98%
rename from cleaning_tools/ner_sentiment_detection.py
rename to cleaning_tools/ner_gender_detection.py
index 4ccde56..3abb53d 100644
--- a/cleaning_tools/ner_sentiment_detection.py
+++ b/cleaning_tools/ner_gender_detection.py
@@ -1,11 +1,12 @@
-import nltk, spacy, scipy
+import nltk, spacy, scipy, json
 import regex as re
-from nltk import pos_tag
 import numpy as np
 import gender_guesser.detector as gender
-from transformers import AutoTokenizer, BertModel
 from collections import Counter
 
+# parameters
+parameters_json = "./parameters.json"
+
 # functions
 def distance_words(word: str, comparison_word: str, model: object) -> float:
     """
@@ -250,6 +251,7 @@ def obtain_final_count(unique_names: list, total_counts: list) -> dict:
     stopwords = nltk.corpus.stopwords.words('spanish')
     gender_detector = gender.Detector()
     nlp = spacy.load("es_core_news_sm")
+    parameters = json.load(open(parameters_json))
 
     # processing text
     cleaned_text = " ".join(spanish_cleaner(text, stopwords))  # cleaning for stopwords