Skip to content

Commit

Permalink
moves preprocessors and feature extractors in sub-folder
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiObeck committed Nov 15, 2021
1 parent 1cd1d8e commit f283e8d
Show file tree
Hide file tree
Showing 12 changed files with 20 additions and 20 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ In case this throws an error like `permission denied` or something similar, you might
chmod -R a+x ./src
```

This gives access rights to all users for all files (recursively) in the scr directory.
This gives access rights to all users for all files (recursively) in the src directory.

### Pipeline Scripts

Expand Down Expand Up @@ -107,7 +107,7 @@ Here, `input.csv` is a csv file (ideally the output of `create_labels.py`), whil

The following flags configure which preprocessing steps are applied:

- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/punctuation_remover.py) for more details)
- `-p` or `--punctuation`: A new column *"tweet_no_punctuation"* is created, where all punctuation is removed from the original tweet. (See [punctuation_remover.py](src/preprocessing/preprocessors/punctuation_remover.py) for more details)
- `-t` or `--tokenize`: Tokenize the given column (can be specified by `--tokenize_input`, default = "tweet"), and create new column with suffix "_tokenized" containing tokenized tweet.
- `-o` or `--other`: Executes all the other preprocessing steps like the removal of non english records and the removal of unnecessary columns.

Expand Down Expand Up @@ -146,7 +146,7 @@ Here, `input.csv` is the respective training, validation, or test set file creat
- `"labels"`: a numpy array containing the target labels for the feature vectors (rows are training examples, only column is the label)

The features to be extracted can be configured with the following optional parameters:
- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [src/feature_extraction/character_length.py](src/feature_extraction/character_length.py))
- `-c` or `--char_length`: Count the number of characters in the "tweet" column of the data frame. (see [`character_length.py`](src/feature_extraction/feature_extractors/character_length.py))

Moreover, the script supports importing and exporting fitted feature extractors with the following optional arguments:
- `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract.
Expand Down Expand Up @@ -271,7 +271,6 @@ python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\
2. `launch.json` configuration to attach the editor to the already started debug process.

```json
// ...
"configurations": [
{
"name": "Python: Attach",
Expand All @@ -283,7 +282,6 @@ python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\
}
},
]
// ...
```

3. Start the attach debug configuration via the VS Code UI ([F5] key or `Run`/`Run and Debug` menu)
11 changes: 6 additions & 5 deletions src/feature_extraction/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
"""

import argparse, csv, pickle

import pandas as pd
import numpy as np
from src.feature_extraction import character_length
from src.feature_extraction.character_length import CharacterLengthFE
from src.feature_extraction.counter_fe import CounterFE
from src.feature_extraction.feature_collector import FeatureCollector
from src.feature_extraction.sentiment_fe import SentimentFE

from src.feature_extraction.feature_extractors.character_length import CharacterLengthFE
from src.feature_extraction.feature_extractors.counter_fe import CounterFE
from src.feature_extraction.feature_extractors.feature_collector import FeatureCollector
from src.feature_extraction.feature_extractors.sentiment_fe import SentimentFE
from src.util import COLUMN_MENTIONS, COLUMN_PHOTOS, COLUMN_TWEET
from src.util import COLUMN_LABEL, COLUMN_HASHTAGS , COLUMN_URLS
from src.util import COLUMN_CASHTAGS, COLUMN_REPLY_TO, COLUMN_TWEET_TOKENIZED
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import ast
import nltk
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

class BigramFeature(FeatureExtractor):

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import numpy as np
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

# class for extracting the character-based length as a feature
class CharacterLengthFE(FeatureExtractor):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import numpy as np
import ast
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

class CounterFE(FeatureExtractor):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import numpy as np
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor

# extend FeatureExtractor for the sake of simplicity
class FeatureCollector(FeatureExtractor):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd
from src.feature_extraction.feature_extractor import FeatureExtractor
from src.feature_extraction.feature_extractors.feature_extractor import FeatureExtractor


class SentimentFE(FeatureExtractor):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import string
from src.preprocessing.preprocessor import Preprocessor
from src.preprocessing.preprocessors.preprocessor import Preprocessor
from src.util import COLUMN_TWEET, COLUMN_PUNCTUATION

# removes punctuation from the original tweet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Tokenize the tweet into individual words.
"""

from src.preprocessing.preprocessor import Preprocessor
from src.preprocessing.preprocessors.preprocessor import Preprocessor
import nltk

class Tokenizer(Preprocessor):
Expand Down
7 changes: 4 additions & 3 deletions src/preprocessing/run_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
"""

import argparse, csv, pickle
from numpy.core.numeric import NaN

import pandas as pd

from sklearn.pipeline import make_pipeline
from src.preprocessing.preprocessors.column_dropper import ColumnDropper
from src.preprocessing.preprocessors.non_english_remover import NonEnglishRemover
from src.preprocessing.punctuation_remover import PunctuationRemover
from src.preprocessing.tokenizer import Tokenizer
from src.preprocessing.preprocessors.punctuation_remover import PunctuationRemover
from src.preprocessing.preprocessors.tokenizer import Tokenizer
from src.util import COLUMN_TWEET, SUFFIX_TOKENIZED


Expand Down

0 comments on commit f283e8d

Please sign in to comment.