Commit 1f2037b: create pointwise feature set then shuffle and split
arana_umass_edu committed Mar 23, 2023 (1 parent: b0679fc)
Showing 3 changed files with 104 additions and 88 deletions.
e2e_scripts/preprocess_s2and_data.py (17 changes: 8 additions & 9 deletions)

@@ -16,14 +16,14 @@
from s2and.data import ANDData
import logging
from s2and.featurizer import FeaturizationInfo, featurize
from preprocess_s2and_pointwise import save_pickled_pointwise_features
from preprocess_s2and_pointwise import save_pickled_pointwise_features, create_signature_features_matrix

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)


def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed):
def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures):
parent_dir = f"{data_home_dir}/{dataset_name}"
AND_dataset = ANDData(
signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
@@ -43,6 +43,7 @@ def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed):
# Load the featurizer, which calculates pairwise similarity scores
featurization_info = FeaturizationInfo()
# the cache will make it faster to train multiple times - it stores the features on disk for you
save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed)
train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset,
featurization_info,
n_jobs=16,
@@ -65,8 +66,8 @@ def find_total_num_train_pairs(blockwise_data):
for block_id in blockwise_data.keys():
count += len(blockwise_data[block_id][0])

print("Total num of signature pairs", count)

print("Total num of signature pairs", count)
# def verify_diff_with_s2and(dataset_name, random_seed):
# parent_dir = f"{DATA_HOME_DIR}/{dataset_name}"
# AND_dataset = ANDData(
@@ -105,7 +106,6 @@ def find_total_num_train_pairs(blockwise_data):
#
# print("VERIFICATION STATUS: ", s2and_set==our_set)


if __name__=='__main__':
# Creates the pickles that store the preprocessed data
# Read cmd line args
@@ -118,14 +118,13 @@ def find_total_num_train_pairs(blockwise_data):
params = args.__dict__
data_home_dir = params["data_home_dir"]
dataset = params["dataset_name"]

point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset)

random_seeds = {1, 2, 3, 4, 5}
for seed in random_seeds:
print("Preprocessing started for seed value", seed)
# Create the ANDData for this seed (TODO: factor this into a helper, e.g. in train_utils.py).
# Provide the ANDData to save_pickled_pointwise_features and save_blockwise_featurized_data.
#save_blockwise_featurized_data(data_home_dir, dataset, seed)
save_pickled_pointwise_features(data_home_dir, dataset, seed)
save_featurized_data(data_home_dir, dataset, seed, point_features_mat, le_signatures)


# Check the pickles are created OK
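As the trailing comment suggests, a quick sanity check can confirm the per-seed pickles landed where save_pickled_pointwise_features writes them. A hypothetical sketch, not part of the commit; PREPROCESSED_DATA_DIR and the dataset name are placeholders:

# Hypothetical sanity check (not in the commit): load each seed's train pickle
# from the paths used by save_pickled_pointwise_features.
import pickle

PREPROCESSED_DATA_DIR = "/path/to/preprocessed"  # placeholder
dataset = "arnetminer"                           # placeholder dataset name

for seed in (1, 2, 3, 4, 5):
    path = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_signature_features.pkl"
    with open(path, "rb") as f:
        block_features = pickle.load(f)  # dict: block_id -> csr_matrix of feature rows
    print(f"seed {seed}: {len(block_features)} blocks")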
e2e_scripts/preprocess_s2and_pointwise.py (146 changes: 77 additions & 69 deletions)

@@ -26,56 +26,71 @@
level=logging.INFO)
logger = logging.getLogger(__name__)

def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed):
logger.info('extracting signature depending on different split')

train_block, val_block, test_block = AND_dataset.split_cluster_signatures()

train_pointwise_features = {}
validation_pointwise_features = {}
test_pointwise_features = {}

# The above three should have a key-list(val) (where val is a list of signature IDs) under them.

# Doing for training block :
for block_id, list_of_signatures in train_block.items():
# Let us transform each of those using label encoder and index them from the sparse matrix.
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

# Doing for validation block :
for block_id, list_of_signatures in val_block.items():
# Let us transform each of those using label encoder and index them from the sparse matrix.
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]
def save_pickled_pointwise_features(AND_dataset, sparse_matrix,
label_encoder_signatures,
random_seed: int = None):
"""
Fetch pointwise features for the dataset and store them in pickles: per train/val/test
split when random_seed is given, otherwise as a single file covering all signatures.
"""

for block_id, list_of_signatures in test_block.items():
# Let us transform each of those using label encoder and index them from the sparse matrix.
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")):
os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")

train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl"
val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl"

with open(train_pkl,"wb") as _pkl_file:
pickle.dump(train_pointwise_features, _pkl_file)
with open(val_pkl,"wb") as _pkl_file:
pickle.dump(validation_pointwise_features, _pkl_file)
with open(test_pkl,"wb") as _pkl_file:
pickle.dump(test_pointwise_features, _pkl_file)
if random_seed:
train_block, val_block, test_block = AND_dataset.split_cluster_signatures()

train_pointwise_features = {}
validation_pointwise_features = {}
test_pointwise_features = {}

# Each of these three dicts maps a block ID to the sparse feature rows of that block's signatures.

# Training blocks: label-encode the signature IDs, then slice their rows from the sparse matrix.
for block_id, list_of_signatures in train_block.items():
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

# Validation blocks: same label-encode-and-slice pattern.
for block_id, list_of_signatures in val_block.items():
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

# Test blocks: same pattern.
for block_id, list_of_signatures in test_block.items():
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")):
os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")

train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/train_signature_features.pkl"
val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/val_signature_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/test_signature_features.pkl"

with open(train_pkl,"wb") as _pkl_file:
pickle.dump(train_pointwise_features, _pkl_file)
with open(val_pkl,"wb") as _pkl_file:
pickle.dump(validation_pointwise_features, _pkl_file)
with open(test_pkl,"wb") as _pkl_file:
pickle.dump(test_pointwise_features, _pkl_file)
else:
processed_data = {}
point_features_mat, _ = create_signature_features_matrix(data_home_dir, AND_dataset.name)
processed_data['mention_level_features'] = point_features_mat

logger.info('Dumping processed data')
file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{dataset_name}_all_signature_features.pkl"

[Inline review comment: Sriharsha-hatwar (Collaborator), Mar 24, 2023]

@anerirana, I think some changes are needed here:

  1. dataset_name is undefined and needs to be replaced with AND_dataset.name.
  2. We need to revalidate whether this is the folder where the featurized data should be stored.

with open(file_name, 'wb') as f:
pickle.dump(processed_data, f)
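A minimal sketch of the fix the reviewer suggests, assuming AND_dataset.name is the intended identifier (note that data_home_dir on the create_signature_features_matrix call above is likewise undefined in this function's scope, and the target folder itself still needs revalidation):

# Hypothetical fix per the review comment (not in the commit): replace the
# undefined dataset_name with AND_dataset.name when building the output path.
file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{AND_dataset.name}_all_signature_features.pkl"
with open(file_name, 'wb') as f:
    pickle.dump(processed_data, f)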


def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
def create_signature_features_matrix(data_home_dir, dataset_name):
"""
Fetch pointwise feature for dataset and store in a pickle.
Generate the pointwise feature set for the entire dataset and return a sparse-matrix
representation of each signature's features, along with the signature label encoder.
"""
processed_data = {}
logger.info("Signature features pre-procesing started")
parent_dir = f"{data_home_dir}/{dataset_name}"
"""
AND_dataset = ANDData(
signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
papers=join(parent_dir, f"{dataset_name}_papers.json"),
@@ -86,33 +101,27 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
val_pairs_size=10000,
test_pairs_size=10000,
name=dataset_name,
n_jobs=16,
random_seed=random_seed
n_jobs=16
)

print("Storing pickled dataset....")
with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
pickle.dump(AND_dataset, f)
"""
# Use below line carefully.
print("Loading pickled dataset...")
with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
AND_dataset = pickle.load(f)
print("Loaded pickle dataset...")
# print("Storing pickled dataset....")
# with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
# pickle.dump(AND_dataset, f)

# # Use below line carefully.
# print("Loading pickled dataset...")
# with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
# AND_dataset = pickle.load(f)
# print("Loaded pickle dataset...")

point_features_mat, le_signatures = pointwise_featurize(AND_dataset,
n_jobs=16,
use_cache=False)

logger.info("Signature features pre-procesing completed")
return point_features_mat, le_signatures



point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset,
n_jobs=16,
use_cache=False)
logger.info('converting feature indices to csr_matrix')
point_features = coo_matrix(
(point_features_data, (point_features_row, point_features_col)),
shape=(num_points, num_feats)
).tocsr()
print("Matrix creation done.")
save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed)

if __name__=='__main__':
# Creates the pickles that store the preprocessed data
# Read cmd line args
@@ -129,5 +138,4 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
random_seed = 1000

print("Preprocessing started")
save_pickled_pointwise_features(data_home_dir, dataset, random_seed)
print("Matrix")
save_pickled_pointwise_features(data_home_dir, dataset)
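Note that this new __main__ still passes (data_home_dir, dataset) to save_pickled_pointwise_features, whose rewritten signature expects an ANDData instance plus the precomputed matrix and encoder. A hedged sketch of the intended wiring, assuming only the function signatures shown in this diff; the ANDData keyword arguments not visible here (clusters, mode) follow usual S2AND usage and are assumptions, and the paths are placeholders:

# Hypothetical driver (not part of the commit) wiring the two new functions together.
from os.path import join
from s2and.data import ANDData
from preprocess_s2and_pointwise import (
    create_signature_features_matrix,
    save_pickled_pointwise_features,
)

data_home_dir = "/path/to/data"  # placeholder
dataset = "arnetminer"           # placeholder dataset name

# The signature-feature matrix is seed-independent, so build it once.
point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset)

for seed in (1, 2, 3, 4, 5):
    parent_dir = f"{data_home_dir}/{dataset}"
    AND_dataset = ANDData(
        signatures=join(parent_dir, f"{dataset}_signatures.json"),
        papers=join(parent_dir, f"{dataset}_papers.json"),
        clusters=join(parent_dir, f"{dataset}_clusters.json"),  # assumed, per S2AND conventions
        mode="train",                                           # assumed
        name=dataset,
        n_jobs=16,
        random_seed=seed,
    )
    # Split by seed, then pickle each split's per-block feature rows.
    save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, seed)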
s2and/featurizer.py (29 changes: 19 additions & 10 deletions)

@@ -10,6 +10,7 @@
from collections import Counter
from collections.abc import Iterable
from IPython import embed
from scipy.sparse import csr_matrix, coo_matrix

from sklearn import preprocessing

@@ -839,7 +840,7 @@ def pointwise_featurize(
chunk_size: int = DEFAULT_CHUNK_SIZE,
):
"""
Featurizes the input dataset and stores as a unified pickle file.
Extract pointwise features from the dataset.
Parameters
----------
@@ -855,16 +856,15 @@
Returns
-------
Returns the following items:
1. Row indices of the sparse matrix containing the data
2. Column indices of the sparse matrix containing the data
3. The data to be filled in the given row and column combination.
1. Sparse matrix of pointwise feature representations for all the signatures in the dataset.
2. Label encoder for indexing signatures by their IDs.
"""
# Do you think OrderedSet and OrderedDict should be used here?
signature_feature_set = set() # Features are stored as str (not tuple) to facilitate label encoding.
signature_dict = {}

# We don't need to iterate signatures per block, since features are created for all signatures irrespective of block.

logger.info('Creating signatures feature set...')
for signature_key, values in dataset.signatures.items():
per_signature_features = dataset.signatures[signature_key]._asdict()
signature_dict[signature_key] = []
@@ -915,7 +915,9 @@ def pointwise_featurize(
print('\n!!!! Found another type !!!!\n')
embed()
exit()
logger.info('Label encoding the values')
logger.info('Created signatures feature set...')

logger.info('Label encoding signature features...')
# Label encoding code ---

""""
@@ -927,7 +929,7 @@
"""
le_signature_feature_set = preprocessing.LabelEncoder()
le_signature_feature_set.fit(list(signature_feature_set))

# Used for easy retrieval of the training, val, and test blocks.
le_signature_dict = preprocessing.LabelEncoder()
le_signature_dict.fit(list(signature_dict.keys()))
@@ -936,15 +938,22 @@
num_points = len(signature_dict.keys())
num_feats = len(signature_feature_set)

for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"):
for key, values in tqdm(signature_dict.items(), desc="Converting to sparse matrix"):
encoded_signature_features = le_signature_feature_set.transform(values)
encoded_key_val = le_signature_dict.transform([key])[0]
for feature_label in encoded_signature_features :
point_features_row.append(encoded_key_val)
point_features_col.append(feature_label)
point_features_data.append(1)

return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict
logger.info('Label encoding completed...')

logger.info('converting feature indices to csr_matrix')
point_features = coo_matrix(
(point_features_data, (point_features_row, point_features_col)),
shape=(num_points, num_feats)
).tocsr()
print("Matrix creation done.")
return point_features, le_signature_dict


def store_featurized_pickles(
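For intuition, a small self-contained example (toy data, not repo code) of the pattern pointwise_featurize now encapsulates: accumulate (row, col, 1) triplets, assemble them into a coo_matrix, convert to CSR, and use the signature LabelEncoder to pull back a block's rows, just as save_pickled_pointwise_features does:

# Toy illustration of the COO-triplet -> CSR construction used above.
from scipy.sparse import coo_matrix
from sklearn import preprocessing

signature_dict = {"sig_a": ["affil_mit", "name_j_doe"], "sig_b": ["name_j_doe"]}
feature_set = ["affil_mit", "name_j_doe"]

le_feats = preprocessing.LabelEncoder().fit(feature_set)
le_sigs = preprocessing.LabelEncoder().fit(list(signature_dict))

rows, cols, data = [], [], []
for sig_id, feats in signature_dict.items():
    row = le_sigs.transform([sig_id])[0]
    for col in le_feats.transform(feats):
        rows.append(row)
        cols.append(col)
        data.append(1)  # binary presence feature

mat = coo_matrix((data, (rows, cols)), shape=(len(signature_dict), len(feature_set))).tocsr()
print(mat.toarray())  # [[1 1], [0 1]]

# Retrieving the rows for a block of signatures, as save_pickled_pointwise_features does:
block_rows = mat[le_sigs.transform(["sig_a", "sig_b"]), :]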
