Initial commit for pickling sparse matrix for different splits
Sriharsha-hatwar committed Mar 23, 2023
1 parent 64d3d28 commit b0679fc
Showing 3 changed files with 78 additions and 30 deletions.
18 changes: 10 additions & 8 deletions e2e_scripts/preprocess_s2and_data.py
@@ -16,14 +16,15 @@
from s2and.data import ANDData
import logging
from s2and.featurizer import FeaturizationInfo, featurize
+from preprocess_s2and_pointwise import save_pickled_pointwise_features

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)


-def save_blockwise_featurized_data(dataset_name, random_seed):
-parent_dir = f"{DATA_HOME_DIR}/{dataset_name}"
+def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed):
+parent_dir = f"{data_home_dir}/{dataset_name}"
AND_dataset = ANDData(
signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
papers=join(parent_dir, f"{dataset_name}_papers.json"),
@@ -115,21 +116,22 @@ def find_total_num_train_pairs(blockwise_data):
print(args)

params = args.__dict__
-DATA_HOME_DIR = params["data_home_dir"]
+data_home_dir = params["data_home_dir"]
dataset = params["dataset_name"]

random_seeds = {1, 2, 3, 4, 5}
for seed in random_seeds:
print("Preprocessing started for seed value", seed)
-save_blockwise_featurized_data(dataset, seed)
+# Create the AND dataset for the particular seed (a helper for this could live in train_utils.py)
+# and provide it to both save_pickled_pointwise_features and save_blockwise_featurized_data.
+#save_blockwise_featurized_data(data_home_dir, dataset, seed)
+save_pickled_pointwise_features(data_home_dir, dataset, seed)


# Check the pickles are created OK
train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_features.pkl"
val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/val_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl"
blockwise_features = read_blockwise_features(train_pkl)
find_total_num_train_pairs(blockwise_features)
-#verify_diff_with_s2and(dataset, seed)



+#verify_diff_with_s2and(dataset, seed)
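Aside (not part of the commit): a minimal sketch of how the per-seed pointwise pickles written by this loop could be read back and sanity-checked. PREPROCESSED_DATA_DIR and the dataset name below are hypothetical placeholders.

import pickle

PREPROCESSED_DATA_DIR = "/tmp/preprocessed"  # hypothetical output root
dataset = "arnetminer"                       # any S2AND dataset name

for seed in (1, 2, 3, 4, 5):
    train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/pointwise/seed{seed}/train_features.pkl"
    with open(train_pkl, "rb") as f:
        block_features = pickle.load(f)  # expected: dict mapping block_id -> csr_matrix
    print(f"seed {seed}: {len(block_features)} train blocks")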
79 changes: 60 additions & 19 deletions e2e_scripts/preprocess_s2and_pointwise.py
@@ -12,6 +12,7 @@
from os.path import join
from s2and.data import ANDData
import pickle
+import os
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
from utils.parser import Parser
@@ -25,13 +26,56 @@
level=logging.INFO)
logger = logging.getLogger(__name__)

-def save_pickled_pointwise_features(data_home_dir, dataset_name):
+def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed):
+logger.info('extracting signatures for the different splits')

+train_block, val_block, test_block = AND_dataset.split_cluster_signatures()

+train_pointwise_features = {}
+validation_pointwise_features = {}
+test_pointwise_features = {}

+# Each of the three dicts above maps a block ID to the list of signature IDs in that block.

+# Doing for the training block:
+for block_id, list_of_signatures in train_block.items():
+# Transform the signature IDs with the label encoder, then use them to index rows of the sparse matrix.
+encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
+train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

+# Doing for the validation block:
+for block_id, list_of_signatures in val_block.items():
+encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
+validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

+# Doing for the test block:
+for block_id, list_of_signatures in test_block.items():
+encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
+test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

+if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")):
+os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")

+train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl"
+val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl"
+test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl"

+with open(train_pkl,"wb") as _pkl_file:
+pickle.dump(train_pointwise_features, _pkl_file)
+with open(val_pkl,"wb") as _pkl_file:
+pickle.dump(validation_pointwise_features, _pkl_file)
+with open(test_pkl,"wb") as _pkl_file:
+pickle.dump(test_pointwise_features, _pkl_file)
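For clarity, a self-contained toy illustration (hypothetical IDs and a tiny matrix, not part of the commit) of the slicing pattern in the three loops above: LabelEncoder.transform maps a block's signature IDs to row positions, and indexing a csr_matrix with that array returns the block's rows as a smaller sparse matrix.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn import preprocessing

all_signature_ids = ["sig_a", "sig_b", "sig_c", "sig_d"]
le = preprocessing.LabelEncoder()
le.fit(all_signature_ids)            # sig_a -> 0, sig_b -> 1, sig_c -> 2, sig_d -> 3

features = csr_matrix(np.eye(4, 5))  # 4 signatures x 5 features, one row per signature

block = ["sig_d", "sig_b"]           # signature IDs belonging to one block
rows = le.transform(block)           # array([3, 1])
block_features = features[rows, :]   # 2 x 5 sparse slice for this block
print(block_features.shape)          # (2, 5)

Incidentally, os.makedirs(path, exist_ok=True) would fold the existence check above into a single call.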


+def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
"""
Fetch pointwise features for the dataset and store them in a pickle.
"""
processed_data = {}
parent_dir = f"{data_home_dir}/{dataset_name}"

"""
AND_dataset = ANDData(
signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
papers=join(parent_dir, f"{dataset_name}_papers.json"),
@@ -43,21 +87,22 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name):
test_pairs_size=10000,
name=dataset_name,
n_jobs=16,
-random_seed=random_seed,
+random_seed=random_seed
)
# print("Storing pickled dataset....")
# with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
# pickle.dump(AND_dataset, f)

# print("Loading pickled dataset...")
# with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
# AND_dataset = pickle.load(f)
# print("Loaded pickle dataset...")
print("Storing pickled dataset....")
with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
pickle.dump(AND_dataset, f)
"""
# Use below line carefully.
print("Loading pickled dataset...")
with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
AND_dataset = pickle.load(f)
print("Loaded pickle dataset...")



-point_features_row, point_features_col, point_features_data, num_feats, num_points = pointwise_featurize(AND_dataset,
+point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset,
n_jobs=16,
use_cache=False)
logger.info('converting feature indices to csr_matrix')
@@ -66,12 +111,7 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name):
shape=(num_points, num_feats)
).tocsr()
print("Matrix creation done.")
-processed_data['mention_level_features'] = point_features
-
-logger.info('Dumping processed data')
-
-with open(f'{dataset_name}_feature_processed.pkl', 'wb') as f:
-pickle.dump(processed_data, f)
+save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed)

if __name__=='__main__':
# Creates the pickles that store the preprocessed data
@@ -86,7 +126,8 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name):
params = args.__dict__
data_home_dir = params["data_home_dir"]
dataset = params["dataset_name"]
+random_seed = 1000

print("Preprocessing started")
-save_pickled_pointwise_features(data_home_dir, dataset)
+save_pickled_pointwise_features(data_home_dir, dataset, random_seed)
print("Matrix")
11 changes: 8 additions & 3 deletions s2and/featurizer.py
@@ -927,19 +927,24 @@ def pointwise_featurize(
"""
le_signature_feature_set = preprocessing.LabelEncoder()
le_signature_feature_set.fit(list(signature_feature_set))

+# Used later for train/val/test block retrieval: maps signature IDs to stable row indices.
+le_signature_dict = preprocessing.LabelEncoder()
+le_signature_dict.fit(list(signature_dict.keys()))

point_features_row, point_features_col, point_features_data = [], [], []
num_points = len(signature_dict.keys())
num_feats = len(signature_feature_set)

-for index, (_, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to spare matrix"):
+for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"):
encoded_signature_features = le_signature_feature_set.transform(values)
+encoded_key_val = le_signature_dict.transform([key])[0]
for feature_label in encoded_signature_features :
-point_features_row.append(index)
+point_features_row.append(encoded_key_val)
point_features_col.append(feature_label)
point_features_data.append(1)

-return point_features_row, point_features_col, point_features_data, num_feats, num_points
+return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict


def store_featurized_pickles(
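The substance of this featurizer change: row indices now come from a LabelEncoder fit on the signature IDs rather than from enumerate(), so the encoder returned to the caller can later map each split's signature IDs back to matrix rows. A toy illustration with hypothetical data (not part of the commit):

from sklearn import preprocessing

signature_dict = {"sig_b": ["title_word_x", "year_2020"], "sig_a": ["year_2020"]}
feature_index = {"title_word_x": 0, "year_2020": 1}  # stand-in for the feature encoder

le_signature_dict = preprocessing.LabelEncoder()
le_signature_dict.fit(list(signature_dict.keys()))    # sig_a -> 0, sig_b -> 1

rows, cols, data = [], [], []
for key, values in signature_dict.items():
    row = le_signature_dict.transform([key])[0]       # stable row id, independent of dict order
    for v in values:
        rows.append(row)
        cols.append(feature_index[v])
        data.append(1)

print(rows, cols, data)  # sig_b's features land on row 1 regardless of iteration order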
