Commit 1f2037b: create pointwise feature set then shuffle and split
arana_umass_edu committed Mar 23, 2023 (1 parent: b0679fc)
Showing 3 changed files with 104 additions and 88 deletions.
e2e_scripts/preprocess_s2and_data.py (17 changes: 8 additions & 9 deletions)

@@ -16,14 +16,14 @@
from s2and.data import ANDData
import logging
from s2and.featurizer import FeaturizationInfo, featurize
from preprocess_s2and_pointwise import save_pickled_pointwise_features
from preprocess_s2and_pointwise import save_pickled_pointwise_features, create_signature_features_matrix

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)


def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed):
def save_featurized_data(data_home_dir, dataset_name, random_seed, point_features_mat, le_signatures):
parent_dir = f"{data_home_dir}/{dataset_name}"
AND_dataset = ANDData(
signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
@@ -43,6 +43,7 @@ def save_blockwise_featurized_data(data_home_dir, dataset_name, random_seed):
# Load the featurizer, which calculates pairwise similarity scores
featurization_info = FeaturizationInfo()
# the cache will make it faster to train multiple times - it stores the features on disk for you
save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, random_seed)
train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset,
featurization_info,
n_jobs=16,
@@ -65,8 +66,8 @@ def find_total_num_train_pairs(blockwise_data):
for block_id in blockwise_data.keys():
count += len(blockwise_data[block_id][0])

print("Total num of signature pairs", count)

print("Total num of signature pairs", count)
# def verify_diff_with_s2and(dataset_name, random_seed):
# parent_dir = f"{DATA_HOME_DIR}/{dataset_name}"
# AND_dataset = ANDData(
@@ -105,7 +106,6 @@ def find_total_num_train_pairs(blockwise_data):
#
# print("VERIFICATION STATUS: ", s2and_set==our_set)


if __name__=='__main__':
# Creates the pickles that store the preprocessed data
# Read cmd line args
@@ -118,14 +118,13 @@ def find_total_num_train_pairs(blockwise_data):
params = args.__dict__
data_home_dir = params["data_home_dir"]
dataset = params["dataset_name"]

point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset)

random_seeds = {1, 2, 3, 4, 5}
for seed in random_seeds:
print("Preprocessing started for seed value", seed)
# Create the ANDData for this seed (TODO: factor this into a helper, e.g. in train_utils.py).
# Provide the ANDData to save_pickled_pointwise_features and save_blockwise_featurized_data.
#save_blockwise_featurized_data(data_home_dir, dataset, seed)
save_pickled_pointwise_features(data_home_dir, dataset, seed)
save_featurized_data(data_home_dir, dataset, seed, point_features_mat, le_signatures)


# Check the pickles are created OK
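As the trailing comment suggests, a quick sanity check can confirm the per-seed pickles landed where save_pickled_pointwise_features writes them. A hypothetical sketch, not part of the commit; PREPROCESSED_DATA_DIR and the dataset name are placeholders:

# Hypothetical sanity check (not in the commit): load each seed's train pickle
# from the paths used by save_pickled_pointwise_features.
import pickle

PREPROCESSED_DATA_DIR = "/path/to/preprocessed"  # placeholder
dataset = "arnetminer"                           # placeholder dataset name

for seed in (1, 2, 3, 4, 5):
    path = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/train_signature_features.pkl"
    with open(path, "rb") as f:
        block_features = pickle.load(f)  # dict: block_id -> csr_matrix of feature rows
    print(f"seed {seed}: {len(block_features)} blocks")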
e2e_scripts/preprocess_s2and_pointwise.py (146 changes: 77 additions & 69 deletions)

@@ -26,56 +26,71 @@
level=logging.INFO)
logger = logging.getLogger(__name__)

def save_pointwise_in_different_splits(AND_dataset, sparse_matrix, label_encoder_signatures, random_seed):
logger.info('extracting signature depending on different split')

train_block, val_block, test_block = AND_dataset.split_cluster_signatures()

train_pointwise_features = {}
validation_pointwise_features = {}
test_pointwise_features = {}

# The above three should have a key-list(val) (where val is a list of signature IDs) under them.

# Doing for training block :
for block_id, list_of_signatures in train_block.items():
# Let us transform each of those using label encoder and index them from the sparse matrix.
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

# Doing for validation block :
for block_id, list_of_signatures in val_block.items():
# Let us transform each of those using label encoder and index them from the sparse matrix.
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]
def save_pickled_pointwise_features(AND_dataset, sparse_matrix,
label_encoder_signatures,
random_seed: int = None):
"""
Fetch pointwise features for the dataset and store them in pickles: per train/val/test
split when random_seed is given, otherwise as a single file covering all signatures.
"""

for block_id, list_of_signatures in test_block.items():
# Let us transform each of those using label encoder and index them from the sparse matrix.
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")):
os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}")

train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/train_features.pkl"
val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/val_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/pointwise/seed{random_seed}/test_features.pkl"

with open(train_pkl,"wb") as _pkl_file:
pickle.dump(train_pointwise_features, _pkl_file)
with open(val_pkl,"wb") as _pkl_file:
pickle.dump(validation_pointwise_features, _pkl_file)
with open(test_pkl,"wb") as _pkl_file:
pickle.dump(test_pointwise_features, _pkl_file)
if random_seed:
train_block, val_block, test_block = AND_dataset.split_cluster_signatures()

train_pointwise_features = {}
validation_pointwise_features = {}
test_pointwise_features = {}

# Each of these three dicts maps a block ID to the sparse feature rows of that block's signatures.

# Training blocks: label-encode the signature IDs, then slice their rows from the sparse matrix.
for block_id, list_of_signatures in train_block.items():
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
train_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

# Validation blocks: same label-encode-and-slice pattern.
for block_id, list_of_signatures in val_block.items():
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
validation_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

# Test blocks: same pattern.
for block_id, list_of_signatures in test_block.items():
encoded_signature_id_list = label_encoder_signatures.transform(list_of_signatures)
test_pointwise_features[block_id] = sparse_matrix[encoded_signature_id_list, :]

if(not os.path.exists(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")):
os.makedirs(f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}")

train_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/train_signature_features.pkl"
val_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/val_signature_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/seed{random_seed}/test_signature_features.pkl"

with open(train_pkl,"wb") as _pkl_file:
pickle.dump(train_pointwise_features, _pkl_file)
with open(val_pkl,"wb") as _pkl_file:
pickle.dump(validation_pointwise_features, _pkl_file)
with open(test_pkl,"wb") as _pkl_file:
pickle.dump(test_pointwise_features, _pkl_file)
else:
processed_data = {}
point_features_mat, _ = create_signature_features_matrix(data_home_dir, AND_dataset.name)
processed_data['mention_level_features'] = point_features_mat

logger.info('Dumping processed data')
file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{dataset_name}_all_signature_features.pkl"

[Inline review comment: Sriharsha-hatwar (Collaborator), Mar 24, 2023]

@anerirana, I think some changes are needed here:

  1. dataset_name is undefined and needs to be replaced with AND_dataset.name.
  2. We need to revalidate whether this is the folder where the featurized data should be stored.

with open(file_name, 'wb') as f:
pickle.dump(processed_data, f)
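A minimal sketch of the fix the reviewer suggests, assuming AND_dataset.name is the intended identifier (note that data_home_dir on the create_signature_features_matrix call above is likewise undefined in this function's scope, and the target folder itself still needs revalidation):

# Hypothetical fix per the review comment (not in the commit): replace the
# undefined dataset_name with AND_dataset.name when building the output path.
file_name = f"{PREPROCESSED_DATA_DIR}/{AND_dataset.name}/{AND_dataset.name}_all_signature_features.pkl"
with open(file_name, 'wb') as f:
    pickle.dump(processed_data, f)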


def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
def create_signature_features_matrix(data_home_dir, dataset_name):
"""
Fetch pointwise feature for dataset and store in a pickle.
Generate the pointwise feature set for the entire dataset and return a sparse-matrix
representation of each signature's features, along with the signature label encoder.
"""
processed_data = {}
logger.info("Signature features pre-procesing started")
parent_dir = f"{data_home_dir}/{dataset_name}"
"""
AND_dataset = ANDData(
signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
papers=join(parent_dir, f"{dataset_name}_papers.json"),
@@ -86,33 +101,27 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
val_pairs_size=10000,
test_pairs_size=10000,
name=dataset_name,
n_jobs=16,
random_seed=random_seed
n_jobs=16
)

print("Storing pickled dataset....")
with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
pickle.dump(AND_dataset, f)
"""
# Use below line carefully.
print("Loading pickled dataset...")
with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
AND_dataset = pickle.load(f)
print("Loaded pickle dataset...")
# print("Storing pickled dataset....")
# with open(f'preprocess_dataset_{dataset_name}.pkl', 'wb') as f:
# pickle.dump(AND_dataset, f)

# # Use below line carefully.
# print("Loading pickled dataset...")
# with open(f'preprocess_dataset_{dataset_name}.pkl', 'rb') as f:
# AND_dataset = pickle.load(f)
# print("Loaded pickle dataset...")

point_features_mat, le_signatures = pointwise_featurize(AND_dataset,
n_jobs=16,
use_cache=False)

logger.info("Signature features pre-procesing completed")
return point_features_mat, le_signatures



point_features_row, point_features_col, point_features_data, num_feats, num_points, le_feature_dict = pointwise_featurize(AND_dataset,
n_jobs=16,
use_cache=False)
logger.info('converting feature indices to csr_matrix')
point_features = coo_matrix(
(point_features_data, (point_features_row, point_features_col)),
shape=(num_points, num_feats)
).tocsr()
print("Matrix creation done.")
save_pointwise_in_different_splits(AND_dataset, point_features, le_feature_dict, random_seed)

if __name__=='__main__':
# Creates the pickles that store the preprocessed data
# Read cmd line args
@@ -129,5 +138,4 @@ def save_pickled_pointwise_features(data_home_dir, dataset_name, random_seed):
random_seed = 1000

print("Preprocessing started")
save_pickled_pointwise_features(data_home_dir, dataset, random_seed)
print("Matrix")
save_pickled_pointwise_features(data_home_dir, dataset)
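Note that this new __main__ still passes (data_home_dir, dataset) to save_pickled_pointwise_features, whose rewritten signature expects an ANDData instance plus the precomputed matrix and encoder. A hedged sketch of the intended wiring, assuming only the function signatures shown in this diff; the ANDData keyword arguments not visible here (clusters, mode) follow usual S2AND usage and are assumptions, and the paths are placeholders:

# Hypothetical driver (not part of the commit) wiring the two new functions together.
from os.path import join
from s2and.data import ANDData
from preprocess_s2and_pointwise import (
    create_signature_features_matrix,
    save_pickled_pointwise_features,
)

data_home_dir = "/path/to/data"  # placeholder
dataset = "arnetminer"           # placeholder dataset name

# The signature-feature matrix is seed-independent, so build it once.
point_features_mat, le_signatures = create_signature_features_matrix(data_home_dir, dataset)

for seed in (1, 2, 3, 4, 5):
    parent_dir = f"{data_home_dir}/{dataset}"
    AND_dataset = ANDData(
        signatures=join(parent_dir, f"{dataset}_signatures.json"),
        papers=join(parent_dir, f"{dataset}_papers.json"),
        clusters=join(parent_dir, f"{dataset}_clusters.json"),  # assumed, per S2AND conventions
        mode="train",                                           # assumed
        name=dataset,
        n_jobs=16,
        random_seed=seed,
    )
    # Split by seed, then pickle each split's per-block feature rows.
    save_pickled_pointwise_features(AND_dataset, point_features_mat, le_signatures, seed)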
s2and/featurizer.py (29 changes: 19 additions & 10 deletions)

@@ -10,6 +10,7 @@
from collections import Counter
from collections.abc import Iterable
from IPython import embed
from scipy.sparse import csr_matrix, coo_matrix

from sklearn import preprocessing

@@ -839,7 +840,7 @@ def pointwise_featurize(
chunk_size: int = DEFAULT_CHUNK_SIZE,
):
"""
Featurizes the input dataset and stores as a unified pickle file.
Extract pointwise features from the dataset.
Parameters
----------
@@ -855,16 +856,15 @@
Returns
-------
Returns the following items:
1. Row indices of the sparse matrix containing the data
2. Column indices of the sparse matrix containing the data
3. The data to be filled in the given row and column combination.
1. Sparse matrix of pointwise feature representations for all the signatures in the dataset.
2. Label encoder for indexing signatures by their IDs.
"""
# Do you think OrderedSet and OrderedDict should be used here?
signature_feature_set = set() # Features are stored as str (not tuple) to facilitate label encoding.
signature_dict = {}

# We don't need to iterate signatures per block, since features are created for all signatures irrespective of block.

logger.info('Creating signatures feature set...')
for signature_key, values in dataset.signatures.items():
per_signature_features = dataset.signatures[signature_key]._asdict()
signature_dict[signature_key] = []
@@ -915,7 +915,9 @@ def pointwise_featurize(
print('\n!!!! Found another type !!!!\n')
embed()
exit()
logger.info('Label encoding the values')
logger.info('Created signatures feature set...')

logger.info('Label encoding signature features...')
# Label encoding code ---

""""
@@ -927,7 +929,7 @@
"""
le_signature_feature_set = preprocessing.LabelEncoder()
le_signature_feature_set.fit(list(signature_feature_set))

# Used for easy retrieval of the training, val, and test blocks.
le_signature_dict = preprocessing.LabelEncoder()
le_signature_dict.fit(list(signature_dict.keys()))
@@ -936,15 +938,22 @@
num_points = len(signature_dict.keys())
num_feats = len(signature_feature_set)

for _, (key, values) in tqdm(enumerate(signature_dict.items()), desc="Converting to sparse matrix"):
for key, values in tqdm(signature_dict.items(), desc="Converting to sparse matrix"):
encoded_signature_features = le_signature_feature_set.transform(values)
encoded_key_val = le_signature_dict.transform([key])[0]
for feature_label in encoded_signature_features :
point_features_row.append(encoded_key_val)
point_features_col.append(feature_label)
point_features_data.append(1)

return point_features_row, point_features_col, point_features_data, num_feats, num_points, le_signature_dict
logger.info('Label encoding completed...')

logger.info('converting feature indices to csr_matrix')
point_features = coo_matrix(
(point_features_data, (point_features_row, point_features_col)),
shape=(num_points, num_feats)
).tocsr()
print("Matrix creation done.")
return point_features, le_signature_dict


def store_featurized_pickles(
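For intuition, a small self-contained example (toy data, not repo code) of the pattern pointwise_featurize now encapsulates: accumulate (row, col, 1) triplets, assemble them into a coo_matrix, convert to CSR, and use the signature LabelEncoder to pull back a block's rows, just as save_pickled_pointwise_features does:

# Toy illustration of the COO-triplet -> CSR construction used above.
from scipy.sparse import coo_matrix
from sklearn import preprocessing

signature_dict = {"sig_a": ["affil_mit", "name_j_doe"], "sig_b": ["name_j_doe"]}
feature_set = ["affil_mit", "name_j_doe"]

le_feats = preprocessing.LabelEncoder().fit(feature_set)
le_sigs = preprocessing.LabelEncoder().fit(list(signature_dict))

rows, cols, data = [], [], []
for sig_id, feats in signature_dict.items():
    row = le_sigs.transform([sig_id])[0]
    for col in le_feats.transform(feats):
        rows.append(row)
        cols.append(col)
        data.append(1)  # binary presence feature

mat = coo_matrix((data, (rows, cols)), shape=(len(signature_dict), len(feature_set))).tocsr()
print(mat.toarray())  # [[1 1], [0 1]]

# Retrieving the rows for a block of signatures, as save_pickled_pointwise_features does:
block_rows = mat[le_sigs.transform(["sig_a", "sig_b"]), :]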
