Skip to content

Commit

Permalink
feat: Add pre-processing feature
Browse files Browse the repository at this point in the history
  • Loading branch information
dekwahdimas committed Sep 9, 2024
1 parent 705b8c5 commit 02698ae
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 34 deletions.
47 changes: 47 additions & 0 deletions web_app/scripts/pre_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

_STOP_WORDS = None  # English stop-word set, built on first use (see below)


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loaded lazily rather than at import time so that importing this
        # module does not require the NLTK corpus to be present yet.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def nltk_pre_processing(content):
    """Tokenize *content*, drop English stop words and lemmatize the rest.

    Args:
        content: The text to process. Stop-word matching is case-sensitive,
            so callers wanting case-insensitive filtering should lower-case
            the text first.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    # Membership is tested against a cached set instead of calling
    # ``stopwords.words('english')`` for every token, which re-read the
    # corpus list and scanned it linearly each time.
    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(content)
        if token not in stop_words
    ]

    return ' '.join(lemmas)

def _clean_text(text):
    """Normalise one review string before it is handed to NLTK.

    The text is lower-cased; punctuation/underscores, digit runs and
    stand-alone single ASCII letters are each replaced by a space; finally
    whitespace runs are collapsed to one space.
    """
    text = str(text).lower()
    # \W already matches every punctuation character, so [\W_] covers exactly
    # the same characters as a longer hand-written punctuation class would.
    text = re.sub(r'[\W_]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # \s+ includes newlines, so no separate newline substitution is needed.
    return re.sub(r'\s+', ' ', text)


def do_pre_processing(filepath):
    """Load a review file, clean its ``content`` column and return the result.

    Args:
        filepath: Path to a ``.csv`` or ``.xlsx`` file (extension matched
            case-insensitively) that contains a ``content`` column.

    Returns:
        A tuple ``(df, headers, data)``: the processed DataFrame, its column
        names as a list, and its rows as a list of lists.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.xlsx``.
        KeyError: If the file has no ``content`` column.
    """
    # Match on the actual suffix; a substring test would misclassify paths
    # such as "my.csv.files/data.xlsx".
    lowered_path = str(filepath).lower()
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(filepath)
    elif lowered_path.endswith('.xlsx'):
        df = pd.read_excel(filepath)
    else:
        raise ValueError(
            f"Unsupported file type: {filepath!r} (expected .csv or .xlsx)"
        )

    # Rows with no review text cannot be cleaned (a NaN cell has no string
    # methods), so they are removed before cleaning rather than after.
    # ``copy()`` keeps the column assignment below from writing to a view.
    df = df.dropna(subset=['content']).copy()

    # Clean the content/review, then run the NLTK pipeline on it
    df['content'] = df['content'].apply(_clean_text).apply(nltk_pre_processing)

    # Drop rows that still contain null data in any column
    df = df.dropna()

    headers = df.columns.tolist()
    data = df.values.tolist()

    return df, headers, data
50 changes: 21 additions & 29 deletions web_app/templates/features/pre-processing.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,44 +13,36 @@ <h5 class="m-0 font-weight-bold text-center">{{ context['feature_title'] }}</h5>
</div>
</div>

<div class="row">
<div class="mb-3 col-auto">
<input class="form-control" type="file" id="formFile">
<form method="POST" enctype="multipart/form-data">
<div class="row">
<div class="mb-3 col-auto">
<input class="form-control" type="file" name="file" id="formFile">
</div>
<div class="mb-3 col-auto">
<button type="submit" value="Upload" class="btn btn-primary">Submit</button>
</div>
</div>
<div class="mb-3 col-auto">
<button type="button" class="btn btn-primary">Submit</button>
</div>
</div>
</form>

{% if context['headers'] and context['data'] %}
<table id="dataTable" class="table table-bordered" style="width:100%">
<thead>
<tr>
<th>Name</th>
<th>Position</th>
<th>Office</th>
<th>Age</th>
<th>Start date</th>
<th>Salary</th>
{% for header in context['headers'] %}
<th>{{ header }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
<tr>
<td>Cedric Kelly</td>
<td>Senior Javascript Developer</td>
<td>Edinburgh</td>
<td>22</td>
<td>2012-03-29</td>
<td>$433,060</td>
</tr>
<tr>
<td>Airi Satou</td>
<td>Accountant</td>
<td>Tokyo</td>
<td>33</td>
<td>2008-11-28</td>
<td>$162,700</td>
</tr>
{% for row in context['data'] %}
<tr>
{% for cell in row %}
<td>{{ cell }}</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
{% endif %}

{% endblock content %}
25 changes: 20 additions & 5 deletions web_app/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from .scripts.scraping_reviews import scrape_reviews
from .scripts.eda import explore_data
from .scripts.pre_processing import do_pre_processing

views = Blueprint('views', __name__)

Expand Down Expand Up @@ -72,13 +73,27 @@ def eda():
return render_template("features/eda.html", context={'feature_title': feature_title})


# NOTE(review): this hunk is rendered without +/- markers. The commit header
# reports 5 deletions in this file; presumably they are the GET-only route
# decorator directly below and the four-line `context = {...}` /
# `return render_template(...)` group further down — confirm against the repo.
@views.route('/pre-processing')
@views.route('/pre-processing', methods=['GET', 'POST'])
def preprocessing():
# Pre-processing page: shows the upload form and, on POST, the processed data.
feature_title = 'Data Pre-Processing'
context = {
'feature_title': feature_title,
}
return render_template("features/pre-processing.html", context=context)
# POST branch: process the uploaded file and render its rows as a table.
if request.method == 'POST':
# upload_file() is defined outside this view; its return value is used as
# the path handed to do_pre_processing — presumably the saved upload, verify.
filepath = upload_file()
# Yields the processed DataFrame plus its column names and row lists.
df, headers, data = do_pre_processing(filepath)

# Create the uploads folder if it doesn't exist
if not os.path.exists(current_app.config['UPLOAD_FOLDER']):
os.makedirs(current_app.config['UPLOAD_FOLDER'])

# The output name reuses the second "_"-separated piece of filepath.
# NOTE(review): raises IndexError if filepath contains no underscore.
df.to_csv(os.path.join(current_app.config['UPLOAD_FOLDER'], f'pre-processed_{filepath.split("_")[1]}.csv'), index=False)

# headers/data feed the table loops in the template.
context = {
'feature_title': feature_title,
'headers': headers,
'data': data,
}
return render_template("features/pre-processing.html", context=context)

# Otherwise: render the page with only the title (no table).
return render_template("features/pre-processing.html", context={'feature_title': feature_title})


@views.route('/modeling-evaluation')
Expand Down

0 comments on commit 02698ae

Please sign in to comment.