feat: Add pre-processing feature

dekwahdimas · Sep 9, 2024 · 02698ae · 02698ae
1 parent 705b8c5
commit 02698ae
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 34 deletions.
diff --git a/web_app/scripts/pre_processing.py b/web_app/scripts/pre_processing.py
@@ -0,0 +1,47 @@
+import re
+
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+def nltk_pre_processing(content):
+    """Tokenize ``content``, drop English stop words and lemmatize the rest.
+
+    Args:
+        content: Text to process. It is passed straight to NLTK's
+            ``word_tokenize``, so it must be a string.
+
+    Returns:
+        The lemmatized, non-stop-word tokens joined by single spaces.
+    """
+    # Load the stop words once and keep them in a set. Calling
+    # ``stopwords.words('english')`` inside the filter condition reloads the
+    # whole list and scans it linearly for every single token.
+    stop_words = set(stopwords.words('english'))
+
+    tokens = word_tokenize(content)
+
+    lemmatizer = WordNetLemmatizer()
+    lemmatized_tokens = [
+        lemmatizer.lemmatize(token)
+        for token in tokens
+        if token not in stop_words
+    ]
+
+    # Join the tokens back into a string
+    return ' '.join(lemmatized_tokens)
+
+# Runs of anything that is not a letter: punctuation, symbols, underscores
+# and digits. Each run is replaced by a single space.
+_NON_LETTER_RE = re.compile(r'[\W\d_]+')
+# Stand-alone single ASCII letters left behind once the symbols are gone.
+_SINGLE_LETTER_RE = re.compile(r'\b[a-zA-Z]\b')
+_WHITESPACE_RE = re.compile(r'\s+')
+
+
+def _clean_text(text):
+    """Lower-case ``text`` and replace symbols, digits and lone letters by spaces."""
+    text = _NON_LETTER_RE.sub(' ', text.lower())
+    text = _SINGLE_LETTER_RE.sub(' ', text)
+    # ``\s`` already covers newlines, so no separate newline pass is needed.
+    return _WHITESPACE_RE.sub(' ', text)
+
+
+def do_pre_processing(filepath):
+    """Load a review file and clean its ``content`` column.
+
+    Args:
+        filepath: Path (string) of a ``.csv`` or ``.xlsx`` file that has a
+            ``content`` column.
+
+    Returns:
+        A ``(df, headers, data)`` tuple: the processed DataFrame, its column
+        names as a list, and its rows as a list of lists.
+
+    Raises:
+        ValueError: If ``filepath`` does not end in ``.csv`` or ``.xlsx``.
+        KeyError: If the loaded file has no ``content`` column.
+    """
+    # Match on the real suffix, case-insensitively. A substring test also
+    # matches directory names, and an unsupported file used to fall through
+    # and fail later with an UnboundLocalError on ``df``.
+    lowered_path = filepath.lower()
+    if lowered_path.endswith('.csv'):
+        df = pd.read_csv(filepath)
+    elif lowered_path.endswith('.xlsx'):
+        df = pd.read_excel(filepath)
+    else:
+        raise ValueError(f'Unsupported file type (expected .csv or .xlsx): {filepath}')
+
+    if 'content' not in df.columns:
+        raise KeyError("Input file has no 'content' column")
+
+    # Empty cells are loaded as NaN (a float), which has no ``.lower()``.
+    # Drop them before cleaning and coerce the remaining values to text.
+    df = df.dropna(subset=['content'])
+    df['content'] = df['content'].astype(str)
+
+    # Clean the content/review
+    df['content'] = df['content'].map(_clean_text)
+
+    # Use NLTK
+    df['content'] = df['content'].map(nltk_pre_processing)
+
+    # Drop null data after being processed
+    df = df.dropna()
+
+    headers = df.columns.tolist()
+    data = df.values.tolist()
+
+    return df, headers, data
diff --git a/web_app/templates/features/pre-processing.html b/web_app/templates/features/pre-processing.html
@@ -13,44 +13,36 @@ <h5 class="m-0 font-weight-bold text-center">{{ context['feature_title'] }}</h5>
     </div>
 </div>
 
-<div class="row">
-    <div class="mb-3 col-auto">
-        <input class="form-control" type="file" id="formFile">
+<form method="POST" enctype="multipart/form-data">
+    <div class="row">
+        <div class="mb-3 col-auto">
+            <input class="form-control" type="file" name="file" id="formFile">
+        </div>
+        <div class="mb-3 col-auto">
+            <button type="submit" value="Upload" class="btn btn-primary">Submit</button>
+        </div>
     </div>
-    <div class="mb-3 col-auto">
-        <button type="button" class="btn btn-primary">Submit</button>
-    </div>
-</div>
+</form>
 
+{% if context['headers'] and context['data'] %}
 <table id="dataTable" class="table table-bordered" style="width:100%">
     <thead>
         <tr>
-            <th>Name</th>
-            <th>Position</th>
-            <th>Office</th>
-            <th>Age</th>
-            <th>Start date</th>
-            <th>Salary</th>
+            {% for header in context['headers'] %}
+            <th>{{ header }}</th>
+            {% endfor %}
         </tr>
     </thead>
     <tbody>
-        <tr>
-            <td>Cedric Kelly</td>
-            <td>Senior Javascript Developer</td>
-            <td>Edinburgh</td>
-            <td>22</td>
-            <td>2012-03-29</td>
-            <td>$433,060</td>
-        </tr>
-        <tr>
-            <td>Airi Satou</td>
-            <td>Accountant</td>
-            <td>Tokyo</td>
-            <td>33</td>
-            <td>2008-11-28</td>
-            <td>$162,700</td>
-        </tr>
+        {% for row in context['data'] %}
+            <tr>
+                {% for cell in row %}
+                <td>{{ cell }}</td>
+                {% endfor %}
+            </tr>
+        {% endfor %}
     </tbody>
 </table>
+{% endif %}
 
 {% endblock content %}
diff --git a/web_app/views.py b/web_app/views.py
@@ -7,6 +7,7 @@
 
 from .scripts.scraping_reviews import scrape_reviews
 from .scripts.eda import explore_data
+from .scripts.pre_processing import do_pre_processing
 
 views = Blueprint('views', __name__)
 
@@ -72,13 +73,27 @@ def eda():
     return render_template("features/eda.html", context={'feature_title': feature_title})
 
 
-@views.route('/pre-processing')
+@views.route('/pre-processing', methods=['GET', 'POST'])
 def preprocessing():
+    """Render the pre-processing page; on POST, process the uploaded file.
+
+    On POST the uploaded file is run through ``do_pre_processing``, the
+    result is saved as ``pre-processed_<name>.csv`` in ``UPLOAD_FOLDER``, and
+    the headers and rows are passed to the template for display.
+
+    NOTE(review): ``<name>`` is the second ``_``-separated part of the
+    uploaded file's basename, which presumably matches the naming scheme
+    used by ``upload_file`` -- confirm against that helper.
+    """
     feature_title = 'Data Pre-Processing'
-    context = {
-        'feature_title': feature_title,
-    }
-    return render_template("features/pre-processing.html", context=context)
+    context = {'feature_title': feature_title}
+
+    if request.method == 'POST':
+        filepath = upload_file()
+        df, headers, data = do_pre_processing(filepath)
+
+        # Create the uploads folder if it doesn't exist. ``exist_ok`` avoids
+        # the race between a separate existence check and the creation.
+        upload_folder = current_app.config['UPLOAD_FOLDER']
+        os.makedirs(upload_folder, exist_ok=True)
+
+        # Split only the file name: an underscore in a directory name would
+        # otherwise leak path fragments into the output name, and a name
+        # without any underscore used to raise IndexError.
+        basename = os.path.basename(filepath)
+        name_parts = basename.split('_')
+        name = name_parts[1] if len(name_parts) > 1 else basename
+        df.to_csv(os.path.join(upload_folder, f'pre-processed_{name}.csv'), index=False)
+
+        context['headers'] = headers
+        context['data'] = data
+
+    return render_template("features/pre-processing.html", context=context)
 
 
 @views.route('/modeling-evaluation')