Skip to content

Commit

Permalink
moves preprocessors and feature extractors in sub-folder
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiObeck committed Nov 15, 2021
1 parent 1cd1d8e commit f283e8d
Show file tree
Hide file tree
Showing 12 changed files with 20 additions and 20 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ In case this throws an error like `permission denied` or something similar, you might
chmod -R a+x ./src
```

This gives access rights to all users for all files (recursively) in the scr directory.
This gives access rights to all users for all files (recursively) in the src directory.

### Pipeline Scripts

Expand Down Expand Up @@ -107,7 +107,7 @@ Here, `input.csv` is a csv file (ideally the output of `create_labels.py`), whil

The following flags configure which preprocessing steps are applied:

- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/punctuation_remover.py) for more details)
- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/preprocessors/punctuation_remover.py) for more details)
- `-t` or `--tokenize`: Tokenize the given column (can be specified by `--tokenize_input`, default = "tweet"), and create new column with suffix "_tokenized" containing tokenized tweet.
- `-o` or `--other`: Executes all the other preprocessing steps like the removal of non english records and the removal of unnecessary columns.

Expand Down Expand Up @@ -146,7 +146,7 @@ Here, `input.csv` is the respective training, validation, or test set file creat
- `"labels"`: a numpy array containing the target labels for the feature vectors (rows are training examples, only column is the label)

The features to be extracted can be configured with the following optional parameters:
- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [src/feature_extraction/character_length.py](src/feature_extraction/character_length.py))
- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [`character_length.py`](src/feature_extraction/feature_extractors/character_length.py))

Moreover, the script supports importing and exporting fitted feature extractors with the following optional arguments:
- `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract.
Expand Down Expand Up @@ -271,7 +271,6 @@ python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\
2. `launch.json` configuration to attach the editor to the already started debug process.

```json
// ...
"configurations": [
{
"name": "Python: Attach",
Expand All @@ -283,7 +282,6 @@ python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\
}
},
]
// ...
```

3. Start the attach debug configuration via the VS Code UI ([F5] key or `Run`/`Run and Debug` menu)
11 changes: 6 additions & 5 deletions src/feature_extraction/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
"""

import argparse, csv, pickle

import pandas as pd
import numpy as np
from src.feature_extraction import character_length
from src.feature_extraction.character_length import CharacterLengthFE
from src.feature_extraction.counter_fe import CounterFE
from src.feature_extraction.feature_collector import FeatureCollector
from src.feature_extraction.sentiment_fe import SentimentFE

from src.feature_extraction.feature_extractors.character_length import CharacterLengthFE
from src.feature_extraction.feature_extractors.counter_fe import CounterFE
from src.feature_extraction.feature_extractors.feature_collector import FeatureCollector
from src.feature_extraction.feature_extractors.sentiment_fe import SentimentFE
from src.util import COLUMN_MENTIONS, COLUMN_PHOTOS, COLUMN_TWEET
from src.util import COLUMN_LABEL, COLUMN_HASHTAGS , COLUMN_URLS
from src.util import COLUMN_CASHTAGS, COLUMN_REPLY_TO, COLUMN_TWEET_TOKENIZED
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import ast
import nltk
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

class BigramFeature(FeatureExtractor):

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import numpy as np
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

# class for extracting the character-based length as a feature
class CharacterLengthFE(FeatureExtractor):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import numpy as np
import ast
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

class CounterFE(FeatureExtractor):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import numpy as np
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

# extend FeatureExtractor for the sake of simplicity
class FeatureCollector(FeatureExtractor):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor


class SentimentFE(FeatureExtractor):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import string
from src.preprocessing.preprocessor import Preprocessor
from src.preprocessing.preprocessors.preprocessor import Preprocessor
from src.util import COLUMN_TWEET, COLUMN_PUNCTUATION

# removes punctuation from the original tweet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Tokenize the tweet into individual words.
"""

from src.preprocessing.preprocessor import Preprocessor
from src.preprocessing.preprocessors.preprocessor import Preprocessor
import nltk

class Tokenizer(Preprocessor):
Expand Down
7 changes: 4 additions & 3 deletions src/preprocessing/run_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
"""

import argparse, csv, pickle
from numpy.core.numeric import NaN

import pandas as pd

from sklearn.pipeline import make_pipeline
from src.preprocessing.preprocessors.column_dropper import ColumnDropper
from src.preprocessing.preprocessors.non_english_remover import NonEnglishRemover
from src.preprocessing.punctuation_remover import PunctuationRemover
from src.preprocessing.tokenizer import Tokenizer
from src.preprocessing.preprocessors.punctuation_remover import PunctuationRemover
from src.preprocessing.preprocessors.tokenizer import Tokenizer
from src.util import COLUMN_TWEET, SUFFIX_TOKENIZED


Expand Down

0 comments on commit f283e8d

Please sign in to comment.