diff --git a/README.md b/README.md
index 5eca96bd..ea9e9aff 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ In case this throws an error like `permission denied` or sth similar, you might
 chmod -R a+x ./src
 ```
 
-This gives access rights to all users for all files (recursively) in the scr directory.
+This gives access rights to all users for all files (recursively) in the src directory.
 
 ### Pipeline Scripts
 
@@ -107,7 +107,7 @@ Here, `input.csv` is a csv file (ideally the output of `create_labels.py`), whil
 
 The following flags configure which preprocessing steps are applied:
 
-- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/punctuation_remover.py) for more details)
+- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/preprocessors/punctuation_remover.py) for more details)
 - `-t` or `--tokenize`: Tokenize the given column (can be specified by `--tokenize_input`, default = "tweet"), and create new column with suffix "_tokenized" containing tokenized tweet.
 - `-o` or `--other`: Executes all the other preprocessing steps like the removal of non english records and the removal of unnecessary columns.
 
@@ -146,7 +146,7 @@ Here, `input.csv` is the respective training, validation, or test set file creat
 - `"labels"`: a numpy array containing the target labels for the feature vectors (rows are training examples, only column is the label)
 
 The features to be extracted can be configured with the following optional parameters:
-- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [src/feature_extraction/character_length.py](src/feature_extraction/character_length.py))
+- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [`character_length.py`](src/feature_extraction/feature_extractors/character_length.py))
 
 Moreover, the script support importing and exporting fitted feature extractors with the following optional arguments:
 - `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract.
@@ -271,7 +271,6 @@ python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\
 2. `launch.json` configuration to attach the editor to the already started debug process.
 
 ```json
-// ...
 "configurations": [
     {
         "name": "Python: Attach",
@@ -283,7 +282,6 @@ python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\
         }
     },
 ]
-// ...
 ```
 
 3. Start the attach debug configuration via the VS Code UI ([F5] key or `Run`/`Run and Debug` menu)
diff --git a/src/feature_extraction/extract_features.py b/src/feature_extraction/extract_features.py
index 8a97972b..eee619b5 100644
--- a/src/feature_extraction/extract_features.py
+++ b/src/feature_extraction/extract_features.py
@@ -5,13 +5,14 @@
 """
 
 import argparse, csv, pickle
+
 import pandas as pd
 import numpy as np
-from src.feature_extraction import character_length
-from src.feature_extraction.character_length import CharacterLengthFE
-from src.feature_extraction.counter_fe import CounterFE
-from src.feature_extraction.feature_collector import FeatureCollector
-from src.feature_extraction.sentiment_fe import SentimentFE
+
+from src.feature_extraction.feature_extractors.character_length import CharacterLengthFE
+from src.feature_extraction.feature_extractors.counter_fe import CounterFE
+from src.feature_extraction.feature_extractors.feature_collector import FeatureCollector
+from src.feature_extraction.feature_extractors.sentiment_fe import SentimentFE
 from src.util import COLUMN_MENTIONS, COLUMN_PHOTOS, COLUMN_TWEET
 from src.util import COLUMN_LABEL, COLUMN_HASHTAGS , COLUMN_URLS
 from src.util import COLUMN_CASHTAGS, COLUMN_REPLY_TO, COLUMN_TWEET_TOKENIZED
diff --git a/src/feature_extraction/bigrams.py b/src/feature_extraction/feature_extractors/bigrams.py
similarity index 85%
rename from src/feature_extraction/bigrams.py
rename to src/feature_extraction/feature_extractors/bigrams.py
index ddd7ad50..0096bccd 100644
--- a/src/feature_extraction/bigrams.py
+++ b/src/feature_extraction/feature_extractors/bigrams.py
@@ -6,7 +6,7 @@
 
 import ast
 import nltk
-from src.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor
 
 
 class BigramFeature(FeatureExtractor):
diff --git a/src/feature_extraction/character_length.py b/src/feature_extraction/feature_extractors/character_length.py
similarity index 88%
rename from src/feature_extraction/character_length.py
rename to src/feature_extraction/feature_extractors/character_length.py
index 83f3f7b7..53e8a9e1 100644
--- a/src/feature_extraction/character_length.py
+++ b/src/feature_extraction/feature_extractors/character_length.py
@@ -5,7 +5,7 @@
 """
 
 import numpy as np
-from src.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor
 
 # class for extracting the character-based length as a feature
 class CharacterLengthFE(FeatureExtractor):
diff --git a/src/feature_extraction/counter_fe.py b/src/feature_extraction/feature_extractors/counter_fe.py
similarity index 89%
rename from src/feature_extraction/counter_fe.py
rename to src/feature_extraction/feature_extractors/counter_fe.py
index 25cdc9cb..383fe592 100644
--- a/src/feature_extraction/counter_fe.py
+++ b/src/feature_extraction/feature_extractors/counter_fe.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 import ast
-from src.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor
 
 class CounterFE(FeatureExtractor):
     """
diff --git a/src/feature_extraction/feature_collector.py b/src/feature_extraction/feature_extractors/feature_collector.py
similarity index 94%
rename from src/feature_extraction/feature_collector.py
rename to src/feature_extraction/feature_extractors/feature_collector.py
index f637f3cf..b0c0e985 100644
--- a/src/feature_extraction/feature_collector.py
+++ b/src/feature_extraction/feature_extractors/feature_collector.py
@@ -5,7 +5,7 @@
 """
 
 import numpy as np
-from src.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor
 
 # extend FeatureExtractor for the sake of simplicity
 class FeatureCollector(FeatureExtractor):
diff --git a/src/feature_extraction/feature_extractor.py b/src/feature_extraction/feature_extractors/feature_extractor.py
similarity index 100%
rename from src/feature_extraction/feature_extractor.py
rename to src/feature_extraction/feature_extractors/feature_extractor.py
diff --git a/src/feature_extraction/sentiment_fe.py b/src/feature_extraction/feature_extractors/sentiment_fe.py
similarity index 93%
rename from src/feature_extraction/sentiment_fe.py
rename to src/feature_extraction/feature_extractors/sentiment_fe.py
index 9707e4f5..440a1e2d 100644
--- a/src/feature_extraction/sentiment_fe.py
+++ b/src/feature_extraction/feature_extractors/sentiment_fe.py
@@ -4,7 +4,7 @@
 
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 import numpy as np
 import pandas as pd
-from src.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor
 
 class SentimentFE(FeatureExtractor):
diff --git a/src/preprocessing/preprocessor.py b/src/preprocessing/preprocessors/preprocessor.py
similarity index 100%
rename from src/preprocessing/preprocessor.py
rename to src/preprocessing/preprocessors/preprocessor.py
diff --git a/src/preprocessing/punctuation_remover.py b/src/preprocessing/preprocessors/punctuation_remover.py
similarity index 93%
rename from src/preprocessing/punctuation_remover.py
rename to src/preprocessing/preprocessors/punctuation_remover.py
index 2359e9a9..47839fc8 100644
--- a/src/preprocessing/punctuation_remover.py
+++ b/src/preprocessing/preprocessors/punctuation_remover.py
@@ -5,7 +5,7 @@
 """
 
 import string
-from src.preprocessing.preprocessor import Preprocessor
+from src.preprocessing.preprocessors.preprocessor import Preprocessor
 from src.util import COLUMN_TWEET, COLUMN_PUNCTUATION
 
 # removes punctuation from the original tweet
diff --git a/src/preprocessing/tokenizer.py b/src/preprocessing/preprocessors/tokenizer.py
similarity index 93%
rename from src/preprocessing/tokenizer.py
rename to src/preprocessing/preprocessors/tokenizer.py
index 4bc3cc0f..9a3244d2 100644
--- a/src/preprocessing/tokenizer.py
+++ b/src/preprocessing/preprocessors/tokenizer.py
@@ -4,7 +4,7 @@
 Tokenize the tweet into individual words.
 """
 
-from src.preprocessing.preprocessor import Preprocessor
+from src.preprocessing.preprocessors.preprocessor import Preprocessor
 import nltk
 
 class Tokenizer(Preprocessor):
diff --git a/src/preprocessing/run_preprocessing.py b/src/preprocessing/run_preprocessing.py
index 9e84744c..bce88f6f 100644
--- a/src/preprocessing/run_preprocessing.py
+++ b/src/preprocessing/run_preprocessing.py
@@ -5,13 +5,14 @@
 """
 
 import argparse, csv, pickle
-from numpy.core.numeric import NaN
+
 import pandas as pd
+
 from sklearn.pipeline import make_pipeline
 from src.preprocessing.preprocessors.column_dropper import ColumnDropper
 from src.preprocessing.preprocessors.non_english_remover import NonEnglishRemover
-from src.preprocessing.punctuation_remover import PunctuationRemover
-from src.preprocessing.tokenizer import Tokenizer
+from src.preprocessing.preprocessors.punctuation_remover import PunctuationRemover
+from src.preprocessing.preprocessors.tokenizer import Tokenizer
 from src.util import COLUMN_TWEET, SUFFIX_TOKENIZED