
Merge tweetclean to main #18

Open · 87 commits into base: main

Commits (87)
b530c66
- adds main function to most scripts
TobiObeck Oct 6, 2021
7b953d6
adds multiple evaluation metrics for classifier
TobiObeck Oct 6, 2021
de3084f
Merge branch 'main' into add-evaluation-metrics
TobiObeck Oct 6, 2021
86a9eb1
separates loading of data sets into setup.sh script
TobiObeck Oct 6, 2021
929db48
Merge branch 'main' of https://github.com/lbechberger/MLinPractice
TobiObeck Oct 7, 2021
ae3c345
Update util.py
pariyashu Oct 7, 2021
0daf32a
Merge branch 'main' of https://github.com/TobiObeck/MLinPractice
TobiObeck Oct 7, 2021
a7a61aa
Merge commit 'a7c7fdb9a3ff5a5af9677b5808279c5f8b018662'
TobiObeck Oct 7, 2021
ee9f4c0
adds parsing of tokenized tweets example
TobiObeck Oct 7, 2021
fde0818
adds func to document tests
TobiObeck Oct 7, 2021
a9ce4de
moves tests into respective code folder
TobiObeck Oct 7, 2021
f37740f
adds documentation how to run tests
TobiObeck Oct 7, 2021
303ba92
test connection
pariyashu Oct 9, 2021
7c873ac
remove unnecessary comment
pariyashu Oct 9, 2021
d0cb537
adds counting of mentions & removal of orig. column
TobiObeck Oct 10, 2021
39b1bdc
minor cleanup
TobiObeck Oct 10, 2021
6483c10
Merge branch 'main' of https://github.com/TobiObeck/MLinPractice
TobiObeck Oct 10, 2021
8988ef6
adds MentionsCounter Preprocessor
TobiObeck Oct 10, 2021
8e92246
minor cleanup
TobiObeck Oct 10, 2021
cf0c340
test connection
pariyashu Oct 9, 2021
ca148df
remove unnecessary comment
pariyashu Oct 9, 2021
5e03e47
Merge pull request #1 from TobiObeck/mentions-count-col
pariyashu Oct 10, 2021
c45a142
Merge branch 'main' of https://github.com/TobiObeck/MLinPractice into…
pariyashu Oct 10, 2021
8777d37
filter language
pariyashu Oct 10, 2021
240dc9a
drop columns inc eng
pariyashu Oct 10, 2021
9ea4c4e
implements column remover as proper preprocessor
TobiObeck Oct 12, 2021
bebf224
Merge branch 'main' of https://github.com/lbechberger/MLinPractice in…
TobiObeck Oct 12, 2021
62b0de2
gets rid of warning by specifying dtypes while reading csv
TobiObeck Oct 13, 2021
6e19a6d
gets rid of warning by specifying dtypes while reading csv
TobiObeck Oct 13, 2021
fba4d6b
Merge branch 'main' of https://github.com/TobiObeck/MLinPractice
TobiObeck Oct 13, 2021
6fdf847
renames folder code -> src
TobiObeck Oct 19, 2021
eaf0117
adds test for counting feature
TobiObeck Oct 19, 2021
33c42bc
cleanup
TobiObeck Oct 19, 2021
d584460
minor changes
TobiObeck Oct 19, 2021
a821f94
renames folder code -> src
TobiObeck Oct 19, 2021
00c91be
stores mlflow and pickle data
TobiObeck Oct 19, 2021
39b3875
Merge branch 'temp-pull-grid-from-bech'
TobiObeck Oct 19, 2021
a07f531
separates examples into corresponding files
TobiObeck Oct 19, 2021
6c860e8
adds randomforest classifier
TobiObeck Oct 22, 2021
2227e83
disables dimensionality reduction
TobiObeck Oct 22, 2021
46d98b4
adds a classification run for all classifiers
TobiObeck Oct 22, 2021
036ec09
adds shebang line for bash scripts
TobiObeck Oct 24, 2021
e81c3f3
Merge branch 'main' of https://github.com/TobiObeck/MLinPractice into…
TobiObeck Oct 24, 2021
b372ca5
mention counter
pariyashu Oct 26, 2021
bf3bf1e
Update run_preprocessing.py
pariyashu Oct 26, 2021
cc0e100
Revert "mention counter"
TobiObeck Oct 27, 2021
a29b5bd
Revert "Update run_preprocessing.py"
TobiObeck Oct 27, 2021
9212938
adds visualization showing language distribution
TobiObeck Nov 1, 2021
f31982e
adds a test for evaluation metrics
TobiObeck Nov 2, 2021
070c757
minor cleanup
TobiObeck Nov 2, 2021
efef989
adds sentiment analysis feature (WIP)
TobiObeck Nov 2, 2021
a73e745
properly implements sentiment analysis
TobiObeck Nov 3, 2021
b1b5ac8
allows naming of classific. runs in mlflow logs
TobiObeck Nov 3, 2021
189fb8f
adds classific. run after implemented sentiment feature
TobiObeck Nov 3, 2021
80b9599
adds more count features
TobiObeck Nov 3, 2021
43d1cca
adds classific. run after adding more count features
TobiObeck Nov 3, 2021
f2cc2aa
adds grid search for optimal hyperparameters
TobiObeck Nov 6, 2021
303ff3f
adds folder for documentation
TobiObeck Nov 7, 2021
1fcafa8
adds docs introduction
TobiObeck Nov 7, 2021
fa5af65
docs: slight improvement of introduction
TobiObeck Nov 7, 2021
e2afa28
docs: adds evaluation (WIP)
TobiObeck Nov 7, 2021
5caf5bb
finishes evaluation
TobiObeck Nov 9, 2021
d23a285
adds preprocessing
TobiObeck Nov 10, 2021
6e93071
docs: improves preprocessing
TobiObeck Nov 11, 2021
2f9b373
docs: adds ml flow results screenshot
TobiObeck Nov 11, 2021
894f244
adds sklearn grid search for random forest classifier
TobiObeck Nov 12, 2021
56afea3
docs: adds counting feature extractor
TobiObeck Nov 12, 2021
0e68b30
docs: adds sentiment feature extractor
TobiObeck Nov 12, 2021
88a0b9b
docs: revises sentiment feature extraction
TobiObeck Nov 13, 2021
6b96d83
docs: fixes image link
TobiObeck Nov 13, 2021
30cdd50
docs: adds random forest classifier motivation
TobiObeck Nov 13, 2021
0eb2a41
docs: adds Results and more features
TobiObeck Nov 13, 2021
260fa13
docs: adds Results and for even more features
TobiObeck Nov 14, 2021
cac898d
docs: adds beginning of Hyperparameter Optimization
TobiObeck Nov 14, 2021
c12b6bf
minor cleanup
TobiObeck Nov 14, 2021
96d177f
docs: rest of hyper param optim and conclusion
TobiObeck Nov 14, 2021
7cd2735
docs: revises some sentences across whole text
TobiObeck Nov 14, 2021
027882a
fixes GridSearchCV
TobiObeck Nov 15, 2021
3c12159
Merge branch 'optimization'
TobiObeck Nov 15, 2021
83e4ab7
Merge branch 'documentation-2'
TobiObeck Nov 15, 2021
a9f3518
docs: updates readme according to the code
TobiObeck Nov 15, 2021
b4c34a3
docs: refines classification flags
TobiObeck Nov 15, 2021
507d027
docs: specifies 5 scenarios for classification.sh
TobiObeck Nov 15, 2021
da77c62
docs: fixes grammar, punctuation and typos
TobiObeck Nov 15, 2021
1cd1d8e
docs: fixes more grammar, punctuation and typos
TobiObeck Nov 15, 2021
f283e8d
moves preprocessors and feature extractors in sub-folder
TobiObeck Nov 15, 2021
5378948
tiny cleanup
TobiObeck Nov 15, 2021
adds test for counting feature
TobiObeck committed Oct 19, 2021
commit eaf01174de488f7dcf818be7d0211b517b7f8c32
40 changes: 40 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,40 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Attach",
"type": "python",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
}
},
{
"name": "Python: Module",
"type": "python",
"request": "launch",
"module": "code",
"cwd": "${workspaceFolder}",
},
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
// "pythonArgs": [
// "-m",
// "src.feature_extraction.test.feature_extraction_test",
// "E:\\MyPC\\code\\git\\myforkMLiP\\MLinPractice\\src\\feature_extraction\\test\\feature_extraction_test.py"
// ],
// "env": {
// "PYTHONPATH": "${workspaceFolder}/code"
// }
}
]
}
42 changes: 35 additions & 7 deletions README.md
@@ -58,7 +58,7 @@ All python scripts and classes for the preprocessing of the input data can be fo
### Creating Labels

The script `create_labels.py` assigns labels to the raw data points based on a threshold on a linear combination of the number of likes and retweets. It is executed as follows:
```python -m code.preprocessing.create_labels path/to/input_dir path/to/output.csv```
```python -m src.preprocessing.create_labels path/to/input_dir path/to/output.csv```
Here, `input_dir` is the directory containing the original raw csv files, while `output.csv` is the single csv file where the output will be written.
The script takes the following optional parameters:
- `-l` or `--likes_weight` determines the relative weight of the number of likes a tweet has received. Defaults to 1.
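The threshold rule described above can be sketched as follows. This is only an illustration of "a threshold on a linear combination of likes and retweets"; the actual logic lives in `create_labels.py`, and the `retweets_weight` and `threshold` defaults here are placeholders, not values taken from the script:

```python
def assign_label(likes, retweets, likes_weight=1, retweets_weight=1, threshold=50):
    # linear combination of likes and retweets, compared against a threshold;
    # the default threshold of 50 is a made-up placeholder
    score = likes_weight * likes + retweets_weight * retweets
    return 1 if score >= threshold else 0
```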
@@ -68,7 +68,7 @@ The script takes the following optional parameters:
### Classical Preprocessing

The script `run_preprocessing.py` is used to run various preprocessing steps on the raw data, producing additional columns in the csv file. It is executed as follows:
```python -m code.preprocessing.run_preprocessing path/to/input.csv path/to/output.csv```
```python -m src.preprocessing.run_preprocessing path/to/input.csv path/to/output.csv```
Here, `input.csv` is a csv file (ideally the output of `create_labels.py`), while `output.csv` is the csv file where the output will be written.
The preprocessing steps to take can be configured with the following flags:
- `-p` or `--punctuation`: A new column "tweet_no_punctuation" is created, where all punctuation is removed from the original tweet. (See `code/preprocessing/punctuation_remover.py` for more details)
@@ -80,7 +80,7 @@ Moreover, the script accepts the following optional parameters:
### Splitting the Data Set

The script `split_data.py` splits the overall preprocessed data into training, validation, and test set. It can be invoked as follows:
```python -m code.preprocessing.split_data path/to/input.csv path/to/output_dir```
```python -m src.preprocessing.split_data path/to/input.csv path/to/output_dir```
Here, `input.csv` is the input csv file to split (containing a column "label" with the label information, i.e., `create_labels.py` needs to be run beforehand) and `output_dir` is the directory where three individual csv files `training.csv`, `validation.csv`, and `test.csv` will be stored.
The script takes the following optional parameters:
- `-t` or `--test_size` determines the relative size of the test set and defaults to 0.2 (i.e., 20 % of the data).
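A dependency-free sketch of such a shuffled three-way split (illustrative only; `split_data.py` has its own implementation, and the `validation_size` parameter here is an assumption mirroring `test_size`):

```python
import random

def three_way_split(rows, test_size=0.2, validation_size=0.2, seed=42):
    # deterministic shuffle, then carve off the test and validation portions
    shuffled = rows[:]
    random.Random(seed).shuffle(shuffled)
    n_test = int(len(shuffled) * test_size)
    n_val = int(len(shuffled) * validation_size)
    test = shuffled[:n_test]
    validation = shuffled[n_test:n_test + n_val]
    training = shuffled[n_test + n_val:]
    return training, validation, test
```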
@@ -93,7 +93,7 @@ The script takes the following optional parameters:
All python scripts and classes for feature extraction can be found in `code/feature_extraction/`.

The script `extract_features.py` takes care of the overall feature extraction process and can be invoked as follows:
```python -m code.feature_extraction.extract_features path/to/input.csv path/to/output.pickle```
```python -m src.feature_extraction.extract_features path/to/input.csv path/to/output.pickle```
Here, `input.csv` is the respective training, validation, or test set file created by `split_data.py`. The file `output.pickle` will be used to store the results of the feature extraction process, namely a dictionary with the following entries:
- `"features"`: a numpy array with the raw feature values (rows are training examples, colums are features)
- `"feature_names"`: a list of feature names for the columns of the numpy array
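The pickle structure described above can be illustrated with a quick round trip. Plain lists stand in for the numpy arrays to keep the sketch dependency-free, and the feature names are made up for the example:

```python
import pickle

# illustrative payload mirroring the documented dictionary keys
payload = {
    "features": [[101, 2], [57, 0]],  # rows are examples, columns are features
    "feature_names": ["character_length", "mentions_count"],
}

# serialize and deserialize, as extract_features.py and its consumers would
restored = pickle.loads(pickle.dumps(payload))
```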
@@ -112,7 +112,7 @@ All python scripts and classes for dimensionality reduction can be found in `cod

The script `reduce_dimensionality.py` takes care of the overall dimensionality reduction procedure and can be invoked as follows:

```python -m code.dimensionality_reduction.reduce_dimensionality path/to/input.pickle path/to/output.pickle```
```python -m src.dimensionality_reduction.reduce_dimensionality path/to/input.pickle path/to/output.pickle```
Here, `input.pickle` is the respective training, validation, or test set file created by `extract_features.py`.
The file `output.pickle` will be used to store the results of the dimensionality reduction process, containing `"features"` (which are the selected/projected ones) and `"labels"` (same as in the input file).

@@ -132,7 +132,7 @@ All python scripts and classes for classification can be found in `code/classifi
### Train and Evaluate a Single Classifier

The script `run_classifier.py` can be used to train and/or evaluate a given classifier. It can be executed as follows:
```python -m code.classification.run_classifier path/to/input.pickle```
```python -m src.classification.run_classifier path/to/input.pickle```
Here, `input.pickle` is a pickle file of the respective data subset, produced by either `extract_features.py` or `reduce_dimensionality.py`.

By default, this data is used to train a **classifier**, which is specified by one of the following optional arguments:
@@ -166,5 +166,33 @@ All python code for the application demo can be found in `code/application/`.

The script `application.py` provides a simple command line interface, where the user is asked to type in their prospective tweet, which is then analyzed using the trained ML pipeline.
The script can be invoked as follows:
```python -m code.application.application path/to/preprocessing.pickle path/to/feature_extraction.pickle path/to/dimensionality_reduction.pickle path/to/classifier.pickle```
```python -m src.application.application path/to/preprocessing.pickle path/to/feature_extraction.pickle path/to/dimensionality_reduction.pickle path/to/classifier.pickle```
The four pickle files correspond to the exported versions for the different pipeline steps as created by `run_preprocessing.py`, `extract_features.py`, `reduce_dimensionality.py`, and `run_classifier.py`, respectively, with the `-e` option.

## Debugging in Visual Studio Code

1. Run the file in debug mode configured to wait for a client, because otherwise it would just finish too quickly:

```
python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\feature_extraction_test.py
```

2. `launch.json` configuration to attach the editor to the already started debug process.

```json
...
"configurations": [
{
"name": "Python: Attach",
"type": "python",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
}
},
]
...
```

3. Start the attach debug configuration via the VS Code UI ([F5] key or `Run`/`Run and Debug` menu)
2 changes: 1 addition & 1 deletion src/feature_extraction/extract_features.py
@@ -39,7 +39,7 @@ def main():
# character length of original tweet (without any changes)
features.append(CharacterLength(COLUMN_TWEET))
features.append(CounterFE(COLUMN_MENTIONS))
# features.append(CounterFE(COLUMN_PHOTOS))
features.append(CounterFE(COLUMN_PHOTOS))

# create overall FeatureCollector
feature_collector = FeatureCollector(features)
60 changes: 60 additions & 0 deletions src/feature_extraction/test/feature_extraction_test.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tests feature extraction
"""

import csv
import logging
import unittest
import pandas as pd
import numpy as np
from src.feature_extraction.counter_fe import CounterFE

class CountFeatureTest(unittest.TestCase):

def setUp(self):
logging.basicConfig()
self.log = logging.getLogger("LOG")

self.tryout_df = pd.read_csv("data/preprocessing/split/training.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")
self.tryout_df = self.tryout_df.rename(columns={"mentions": "mockcolumn", "photos": "mockphotos"})

self.INPUT_COLUMN = "mockcolumn"
self.counter_feature = CounterFE(self.INPUT_COLUMN)
# self.df = pd.DataFrame({ self.INPUT_COLUMN: [{'screen_name': 'zeebusiness', 'name': 'zee business', 'id': '140798905'}, {'screen_name': 'amishdevgan', 'name': 'amish devgan', 'id': '163817624'}] } )

self.df = pd.DataFrame()
# self.df[self.INPUT_COLUMN] = "['[\"This\", \"row\", \"has\", \"five\", \"elements\"], [\"this\", \"only\", \"thre\"], [\"one\"], []']"
self.df[self.INPUT_COLUMN] = [
"[{'screen_name': 'zeebusiness', 'name': 'zee business', 'id': '140798905'}, {'screen_name': 'amishdevgan', 'name': 'amish devgan', 'id': '163817624'}]",
"[]",
"[{'screen_name': 'zeebusiness', 'name': 'zee business', 'id': '140798905'}]"
]
print("")


def test_input_columns(self):
self.assertEqual(self.counter_feature._input_columns, [self.INPUT_COLUMN])


def test_feature_name(self):
self.assertEqual(self.counter_feature.get_feature_name(), self.INPUT_COLUMN + "_count")


def test_counting(self):
self.counter_feature.fit(self.df)
actual_feature = self.counter_feature.transform(self.df)
# actual_feature = self.counter_feature.transform(self.tryout_df)
# EXPECTED = np.array(pd.DataFrame({"mockcolumn_count": [5,3,1,0]}))
EXPECTED = np.array(pd.DataFrame({"mockcolumn_count": [2,0,1]}))

# self.log.warning("actual_feature", actual_feature)
# self.log.warning("EXPECTED", EXPECTED)

isEqual = np.array_equal(actual_feature, EXPECTED, equal_nan=False)
self.assertTrue(isEqual)


if __name__ == '__main__':
unittest.main()
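The test above feeds `CounterFE` stringified Python lists and expects the element counts back as a column. A minimal sketch of such a counter, inferred from the test's inputs and expectations (this is an assumption about how `counter_fe.py` works, not the actual implementation):

```python
import ast

class CounterSketch:
    """Counts the number of elements in a stringified list, per row."""

    def __init__(self, input_column):
        self._input_columns = [input_column]

    def get_feature_name(self):
        return self._input_columns[0] + "_count"

    def fit(self, df):
        # counting needs no fitting; present only to mirror the extractor interface
        pass

    def transform(self, df):
        # parse each cell like "[{...}, {...}]" and count its elements
        counts = [len(ast.literal_eval(cell)) for cell in df[self._input_columns[0]]]
        return [[c] for c in counts]  # column vector, matching the test's expected shape
```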