Skip to content

Commit

Permalink
feat: Add templates, scrape script, and routing
Browse files Browse the repository at this point in the history
  • Loading branch information
dekwahdimas committed Sep 5, 2024
1 parent b533fe4 commit 4584fa2
Show file tree
Hide file tree
Showing 13 changed files with 659 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# custom
uploads/
test_html.html
test_python.py
6 changes: 6 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from web_app import sentiment_app

# Module-level WSGI application object, built by the package's factory.
app = sentiment_app()

if __name__ == '__main__':
    import os

    # The interactive debugger allows arbitrary code execution, so it must be
    # possible to switch it off without editing the source. It stays enabled
    # by default (the previous behaviour); set FLASK_DEBUG=0 to disable it.
    debug_flag = os.environ.get('FLASK_DEBUG', '1').strip().lower()
    app.run(debug=debug_flag not in ('0', 'false', 'no', 'off'))
16 changes: 16 additions & 0 deletions web_app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
from flask import Flask

def sentiment_app():
    """Create and configure the Flask application.

    Sets the session secret and the upload directory in ``app.config`` and
    registers the ``views`` blueprint at the URL root.

    Returns:
        The configured ``Flask`` application instance.
    """
    app = Flask(__name__)

    # Prefer a secret supplied through the environment. The literal fallback
    # keeps existing setups working, but it is committed to version control
    # and therefore must not be relied on outside local development.
    app.config['SECRET_KEY'] = os.environ.get(
        'SECRET_KEY',
        'KldU98e3@#rj09(dwfa)dcvP[!2]cdKajewqIwnLemNRdaqW',
    )

    # Uploads live in an "uploads" directory under the current working
    # directory, i.e. relative to wherever the process was started.
    app.config['UPLOAD_FOLDER'] = os.path.join(os.getcwd(), 'uploads')

    # Imported inside the factory rather than at module top
    # (presumably to avoid a circular import with .views -- confirm).
    from .views import views

    app.register_blueprint(views, url_prefix='/')

    return app
170 changes: 170 additions & 0 deletions web_app/scripts/scraping_reviews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

from typing import List, Optional, Tuple
from datetime import datetime
from time import sleep
from tqdm import tqdm

import pandas as pd
import json
import pytz
import time

class _ContinuationToken:
__slots__ = (
"token",
"lang",
"country",
"sort",
"count",
"filter_score_with",
"filter_device_with",
)

def __init__(
self, token, lang, country, sort, count, filter_score_with, filter_device_with
):
self.token = token
self.lang = lang
self.country = country
self.sort = sort
self.count = count
self.filter_score_with = filter_score_with
self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """Request one page of reviews and return its items and next-page token.

    Args:
        url: Reviews endpoint URL to POST to.
        app_id: Identifier of the app whose reviews are requested.
        sort: Sort order value placed in the request body.
        count: Number of reviews requested for this page.
        filter_score_with: Score filter, or ``None`` (sent as ``"null"``).
        filter_device_with: Device filter, or ``None`` (sent as ``"null"``).
        pagination_token: Token of the page to fetch, or ``None``.

    Returns:
        A 2-tuple ``(review_items, next_token)`` taken from the decoded
        response payload.

    Raises:
        IndexError, TypeError: If the response does not have the expected
            structure (the caller treats these as a failed fetch).
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The inner payload is itself a JSON string; decode it once and read both
    # the review list and the next-page token from the same object.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]


def reviews(
    app_id: str,
    lang: str = "id",
    country: str = "id",
    sort: Sort = Sort.NEWEST,
    count: int = 100,
    filter_score_with: Optional[int] = None,
    filter_device_with: Optional[int] = None,
    continuation_token: Optional[_ContinuationToken] = None,
    max_retries: int = 5,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews for ``app_id``, optionally resuming.

    Args:
        app_id: Identifier of the app whose reviews are fetched.
        lang: Language code used to build the request URL.
        country: Country code used to build the request URL.
        sort: Sort order; its ``.value`` is sent in the request.
        count: Number of reviews to collect in this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call. When given,
            its stored lang/country/sort/count/filters override the
            corresponding arguments of this call.
        max_retries: How many consecutive failed fetches of the same page are
            retried before giving up.

    Returns:
        ``(result, token)`` where ``result`` is a list of review dicts and
        ``token`` is a ``_ContinuationToken`` for the next call. Its
        ``.token`` is ``None`` once the last page has been reached. If a
        page keeps failing, the reviews collected so far are returned and the
        token still points at the failed page so the caller may resume.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        # A token whose cursor is None marks an exhausted listing.
        if token is None:
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    max_page_size = 200  # largest page requested in a single fetch
    result = []
    consecutive_failures = 0

    # "<" rather than "== 0" on the remainder: stop even if a page returned
    # more items than were asked for.
    while len(result) < count:
        fetch_count = min(count - len(result), max_page_size)

        try:
            review_items, next_token = _fetch_review_items(
                url,
                app_id,
                sort,
                fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # ``token`` is untouched by the failed call, so retrying re-requests
            # exactly the same page. Resetting to the starting token would
            # duplicate earlier pages, and there may be no starting token at all.
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                break
            continue

        consecutive_failures = 0
        token = next_token

        result.extend(
            {
                k: spec.extract_content(review)
                for k, spec in ElementSpecs.Review.items()
            }
            for review in review_items
        )

        # A list in the token position means there is no next page.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )

def scrape_reviews(app_id, total_scrape, filter_score):
    """Collect up to ``total_scrape`` reviews of ``app_id`` into a DataFrame.

    Reviews are requested newest-first with ``lang='en'`` and
    ``country='id'``, page by page, until enough have been collected or a
    call returns nothing.

    Args:
        app_id: Identifier of the app to scrape.
        total_scrape: Maximum number of reviews to return.
        filter_score: Score to filter by; ``None`` scrapes all ratings.

    Returns:
        A ``pandas.DataFrame`` with one row per review and at most
        ``total_scrape`` rows.
    """
    MAX_COUNT_EACH_FETCH = 200  # max reviews requested per call

    result = []
    continuation_token = None

    with tqdm(total=total_scrape, position=0, leave=True) as pbar:
        while len(result) < total_scrape:
            remaining = total_scrape - len(result)
            new_result, continuation_token = reviews(
                app_id,
                continuation_token=continuation_token,
                lang='en',
                country='id',
                sort=Sort.NEWEST,
                filter_score_with=filter_score,
                # Only honoured on the first call: once a continuation token
                # exists, reviews() takes the count stored in the token.
                count=min(MAX_COUNT_EACH_FETCH, remaining),
            )
            if not new_result:
                break

            # Later pages can overshoot, so keep only what is still needed.
            kept = new_result[:remaining]
            result.extend(kept)
            pbar.update(len(kept))

    return pd.DataFrame(result)
42 changes: 42 additions & 0 deletions web_app/templates/base.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Sentiment Analysis</title>

    {# NOTE(review): the Bootstrap version in this URL and in the script URL
       below is assumed to be 5.3.3 -- confirm against the deployed file. #}
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css">
    <link rel="stylesheet" href="https://cdn.datatables.net/2.1.5/css/dataTables.bootstrap5.css">
</head>
<body>
    <!-- Start navbar -->
    {% include "navbar.html" %}
    <!-- End navbar -->
    <br>

    {# This container is intentionally left open here: each child template
       ends its content block with the matching closing div. #}
    <div class="container">
    <!-- Start main content -->
    {% block content %}

    {% endblock content %}
    <!-- End main content -->

    <!-- Start bootstrap js -->
    {# The integrity value must be the hash of the bundle itself; a hash that
       belongs to a different file makes the browser refuse to run it.
       NOTE(review): verify this hash against the Bootstrap 5.3.3 docs. #}
    <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
    <!-- End bootstrap js -->

    <!-- Start datatables js -->
    {# jQuery first, then DataTables core exactly once, then its Bootstrap 5 styling integration. #}
    <script type="text/javascript" src="https://code.jquery.com/jquery-3.7.1.js"></script>
    <script type="text/javascript" src="https://cdn.datatables.net/2.1.5/js/dataTables.js"></script>
    <script type="text/javascript" src="https://cdn.datatables.net/2.1.5/js/dataTables.bootstrap5.js"></script>

    <script>
        $(document).ready(function () {
            $('#dataTable').DataTable();
        });
    </script>
    <!-- End datatables js -->

</body>
</html>
56 changes: 56 additions & 0 deletions web_app/templates/features/eda.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{% extends "base.html" %}

{% block content %}
{# Placeholder page: intro card, file picker and a sample table that
   base.html turns into a DataTable via the #dataTable id. #}

<div class="card shadow mb-4">
    <div class="card-header py-3">
        {# fw-bold is the Bootstrap 5 name of the bold utility class. #}
        <h5 class="m-0 fw-bold text-center">{{ context['feature_title'] }}</h5>
    </div>
    <div class="card-body">
        <p>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut tortor velit, lacinia quis diam a, volutpat aliquet sem. Maecenas eget nulla a justo tristique vulputate. Quisque efficitur ipsum a lectus lobortis ultricies. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent volutpat metus nec eros luctus pulvinar. Cras laoreet elit ut cursus laoreet. Vivamus urna ante, finibus sit amet tellus sit amet, gravida vulputate augue. Suspendisse eget purus augue. Nulla luctus sit amet metus non suscipit.
        </p>
    </div>
</div>

<div class="row">
    <div class="mb-3 col-auto">
        <input class="form-control" type="file" id="formFile" aria-label="Choose file">
    </div>
    <div class="mb-3 col-auto">
        <button type="button" class="btn btn-primary">Submit</button>
    </div>
</div>

<table id="dataTable" class="table table-bordered" style="width:100%">
    <thead>
        <tr>
            <th>Name</th>
            <th>Position</th>
            <th>Office</th>
            <th>Age</th>
            <th>Start date</th>
            <th>Salary</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>Cedric Kelly</td>
            <td>Senior Javascript Developer</td>
            <td>Edinburgh</td>
            <td>22</td>
            <td>2012-03-29</td>
            <td>$433,060</td>
        </tr>
        <tr>
            <td>Airi Satou</td>
            <td>Accountant</td>
            <td>Tokyo</td>
            <td>33</td>
            <td>2008-11-28</td>
            <td>$162,700</td>
        </tr>
    </tbody>
</table>
{# Closes the container div that base.html opens before the content block. #}
</div>
{% endblock content %}
56 changes: 56 additions & 0 deletions web_app/templates/features/modeling-evaluation.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{% extends "base.html" %}

{% block content %}
{# Placeholder page: intro card, file picker and a sample table that
   base.html turns into a DataTable via the #dataTable id. #}

<div class="card shadow mb-4">
    <div class="card-header py-3">
        {# fw-bold is the Bootstrap 5 name of the bold utility class. #}
        <h5 class="m-0 fw-bold text-center">{{ context['feature_title'] }}</h5>
    </div>
    <div class="card-body">
        <p>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut tortor velit, lacinia quis diam a, volutpat aliquet sem. Maecenas eget nulla a justo tristique vulputate. Quisque efficitur ipsum a lectus lobortis ultricies. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent volutpat metus nec eros luctus pulvinar. Cras laoreet elit ut cursus laoreet. Vivamus urna ante, finibus sit amet tellus sit amet, gravida vulputate augue. Suspendisse eget purus augue. Nulla luctus sit amet metus non suscipit.
        </p>
    </div>
</div>

<div class="row">
    <div class="mb-3 col-auto">
        <input class="form-control" type="file" id="formFile" aria-label="Choose file">
    </div>
    <div class="mb-3 col-auto">
        <button type="button" class="btn btn-primary">Submit</button>
    </div>
</div>

<table id="dataTable" class="table table-bordered" style="width:100%">
    <thead>
        <tr>
            <th>Name</th>
            <th>Position</th>
            <th>Office</th>
            <th>Age</th>
            <th>Start date</th>
            <th>Salary</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>Cedric Kelly</td>
            <td>Senior Javascript Developer</td>
            <td>Edinburgh</td>
            <td>22</td>
            <td>2012-03-29</td>
            <td>$433,060</td>
        </tr>
        <tr>
            <td>Airi Satou</td>
            <td>Accountant</td>
            <td>Tokyo</td>
            <td>33</td>
            <td>2008-11-28</td>
            <td>$162,700</td>
        </tr>
    </tbody>
</table>
{# Closes the container div that base.html opens before the content block. #}
</div>
{% endblock content %}
Loading

0 comments on commit 4584fa2

Please sign in to comment.