-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add templates, scrape script, and routing
- Loading branch information
1 parent
b533fe4
commit 4584fa2
Showing
13 changed files
with
659 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
"""Entry point that builds the sentiment web application and runs it."""
import os

from web_app import sentiment_app

# Application object produced by the package's factory function.
app = sentiment_app()

if __name__ == '__main__':
    # Debug mode remains on by default (the previous hard-coded behaviour),
    # but can now be switched off without editing code, e.g. FLASK_DEBUG=0.
    # NOTE(review): the interactive debugger must never be reachable from an
    # untrusted network; disable it anywhere other than local development.
    debug_flag = os.environ.get('FLASK_DEBUG', '1').strip().lower()
    app.run(debug=debug_flag not in ('0', 'false', 'no', 'off'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import os | ||
from flask import Flask | ||
|
||
def sentiment_app():
    """Create and configure the Flask application.

    Sets ``SECRET_KEY`` and ``UPLOAD_FOLDER`` in the application config and
    registers the ``views`` blueprint under the ``/`` URL prefix.

    Returns:
        The configured ``Flask`` application instance.
    """
    app = Flask(__name__)

    # Prefer a key supplied through the SECRET_KEY environment variable so a
    # deployment does not have to rely on a value committed to the repository.
    # NOTE(review): the literal fallback is kept only so existing setups keep
    # working; it is visible in source control and should be rotated.
    app.config['SECRET_KEY'] = os.environ.get(
        'SECRET_KEY',
        'KldU98e3@#rj09(dwfa)dcvP[!2]cdKajewqIwnLemNRdaqW',
    )

    # Uploads go to an "uploads" directory under the current working
    # directory of the process (not relative to this package).
    app.config['UPLOAD_FOLDER'] = os.path.join(os.getcwd(), 'uploads')

    # Imported inside the factory rather than at module level --
    # presumably to avoid a circular import with the views module; confirm.
    from .views import views

    app.register_blueprint(views, url_prefix='/')

    return app
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
from google_play_scraper import Sort | ||
from google_play_scraper.constants.element import ElementSpecs | ||
from google_play_scraper.constants.regex import Regex | ||
from google_play_scraper.constants.request import Formats | ||
from google_play_scraper.utils.request import post | ||
|
||
from typing import List, Optional, Tuple | ||
from datetime import datetime | ||
from time import sleep | ||
from tqdm import tqdm | ||
|
||
import pandas as pd | ||
import json | ||
import pytz | ||
import time | ||
|
||
class _ContinuationToken: | ||
__slots__ = ( | ||
"token", | ||
"lang", | ||
"country", | ||
"sort", | ||
"count", | ||
"filter_score_with", | ||
"filter_device_with", | ||
) | ||
|
||
def __init__( | ||
self, token, lang, country, sort, count, filter_score_with, filter_device_with | ||
): | ||
self.token = token | ||
self.lang = lang | ||
self.country = country | ||
self.sort = sort | ||
self.count = count | ||
self.filter_score_with = filter_score_with | ||
self.filter_device_with = filter_device_with | ||
|
||
|
||
def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """POST a single review-page request and unpack the response.

    Args:
        url: Endpoint to post to.
        app_id: Identifier of the app whose reviews are requested.
        sort: Sort value placed in the request body.
        count: Number of reviews requested for this page.
        filter_score_with: Score filter; ``None`` is sent as the string
            ``"null"``.
        filter_device_with: Device filter; ``None`` is sent as the string
            ``"null"``.
        pagination_token: Token identifying the page to fetch, or ``None``.

    Returns:
        A 2-tuple taken from the decoded payload: its first element (the
        review items) and the last element of its second-to-last element
        (used by callers as the next pagination token).

    Raises:
        IndexError, TypeError: May propagate from the indexing / decoding
            steps when the response does not have the expected shape.
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The review data is itself a JSON string nested inside the outer
    # document; decode it once and reuse it for both returned values.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]
|
||
|
||
def reviews(
    app_id: str,
    lang: str = "id",
    country: str = "id",
    sort: Sort = Sort.NEWEST,
    count: int = 100,
    filter_score_with: Optional[int] = None,
    filter_device_with: Optional[int] = None,
    continuation_token: Optional[_ContinuationToken] = None,
    max_retries: int = 10,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews for ``app_id``, page by page.

    Args:
        app_id: Identifier of the app whose reviews are requested.
        lang: Language passed to the URL builder.
        country: Country passed to the URL builder.
        sort: Sort order; its ``.value`` is sent in the request.
        count: Total number of reviews wanted from this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call. When given,
            its stored ``lang``, ``country``, ``sort``, ``count`` and filters
            replace the corresponding arguments of this call.
        max_retries: Maximum number of consecutive failed fetches tolerated
            before the call gives up and returns what it has collected.

    Returns:
        ``(result, token)`` where ``result`` is a list of dicts (one per
        review, keyed by the entries of ``ElementSpecs.Review``) and
        ``token`` is a ``_ContinuationToken`` for resuming. If the supplied
        continuation token carries no pagination token, ``([], token)`` is
        returned without any request being made.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            # Nothing left to resume from.
            return (
                [],
                continuation_token,
            )

        # Resume with exactly the settings the token was created with.
        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    result = []
    remaining = count
    consecutive_failures = 0

    # "> 0" (rather than "!= 0") also stops the loop if a page delivers more
    # items than were asked for.
    while remaining > 0:
        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                min(remaining, 200),  # a single request asks for at most 200
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The response did not have the expected shape. ``token`` still
            # holds the value used for the failed request, so retrying simply
            # repeats that page. It must not be reset from
            # ``continuation_token``: that is None on a first call, and
            # otherwise points at the start of this call, which would fetch
            # already-collected reviews again.
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Give up; the caller can resume from the returned token.
                break
            continue

        consecutive_failures = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        remaining = count - len(result)

        # A list-valued token is treated as the end of pagination.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )
|
||
def scrape_reviews(app_id, total_scrape, filter_score, lang='en', country='id'):
    """Collect up to ``total_scrape`` reviews for an app into a DataFrame.

    Args:
        app_id: Identifier of the app to scrape.
        total_scrape: Maximum number of reviews to return.
        filter_score: Score filter forwarded to ``reviews``; ``None`` means
            no score filtering.
        lang: Language forwarded to ``reviews``.
        country: Country forwarded to ``reviews``.

    Returns:
        A ``pandas.DataFrame`` with one row per review, newest first as
        requested from ``reviews``; it holds at most ``total_scrape`` rows
        and fewer if no more reviews are returned.
    """
    MAX_COUNT_EACH_FETCH = 200  # upper bound on reviews requested per call

    result = []
    continuation_token = None

    with tqdm(total=total_scrape, position=0, leave=True) as pbar:
        while len(result) < total_scrape:
            still_needed = total_scrape - len(result)
            new_result, continuation_token = reviews(
                app_id,
                continuation_token=continuation_token,
                lang=lang,
                country=country,
                sort=Sort.NEWEST,
                filter_score_with=filter_score,
                # Only the first call honours this value; later calls reuse
                # the count stored in the continuation token.
                count=min(MAX_COUNT_EACH_FETCH, still_needed),
            )
            if not new_result:
                break

            # A batch can overshoot the requested total; keep only what is
            # still needed so neither the result nor the progress bar exceeds
            # ``total_scrape``.
            new_result = new_result[:still_needed]
            result.extend(new_result)
            pbar.update(len(new_result))

    return pd.DataFrame(result)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta charset="UTF-8"> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | ||
<title>Google Play Store Sentiment Analysis</title> | ||
|
||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css"> | ||
<link rel="stylesheet" href="https://cdn.datatables.net/2.1.5/css/dataTables.bootstrap5.css"> | ||
</head> | ||
<body> | ||
<!-- Start navbar --> | ||
{% include "navbar.html" %} | ||
<!-- End navbar --> | ||
<br> | ||
|
||
<div class="container"> | ||
<!-- Start main content --> | ||
{% block content %} | ||
|
||
{% endblock content %} | ||
<!-- End main content --> | ||
|
||
<!-- Start bootstrap js --> | ||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js" integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r" crossorigin="anonymous"></script> | ||
<!-- End bootstrap js --> | ||
|
||
<!-- Start datatables js -->
<script type="text/javascript" src="https://code.jquery.com/jquery-3.7.1.js"></script>
<!-- Load the DataTables core only once: including both the full and the minified build defines the library twice. -->
<script type="text/javascript" src="https://cdn.datatables.net/2.1.5/js/dataTables.js"></script>
<script type="text/javascript" src="https://cdn.datatables.net/2.1.5/js/dataTables.bootstrap5.js"></script>

<script>
    // Turn the table with id "dataTable" into a DataTable once the DOM is ready.
    $(document).ready(function () {
        $('#dataTable').DataTable();
    });
</script>
<!-- End datatables js -->
|
||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
{% extends "base.html" %} | ||
|
||
{% block content %} | ||
|
||
<div class="card shadow mb-4"> | ||
<div class="card-header py-3"> | ||
<h5 class="m-0 font-weight-bold text-center">{{ context['feature_title'] }}</h5> | ||
</div> | ||
<div class="card-body"> | ||
<p> | ||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut tortor velit, lacinia quis diam a, volutpat aliquet sem. Maecenas eget nulla a justo tristique vulputate. Quisque efficitur ipsum a lectus lobortis ultricies. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent volutpat metus nec eros luctus pulvinar. Cras laoreet elit ut cursus laoreet. Vivamus urna ante, finibus sit amet tellus sit amet, gravida vulputate augue. Suspendisse eget purus augue. Nulla luctus sit amet metus non suscipit. | ||
</p> | ||
</div> | ||
</div> | ||
|
||
<div class="row"> | ||
<div class="mb-3 col-auto"> | ||
<input class="form-control" type="file" id="formFile"> | ||
</div> | ||
<div class="mb-3 col-auto"> | ||
<button type="button" class="btn btn-primary">Submit</button> | ||
</div> | ||
</div> | ||
|
||
<table id="dataTable" class="table table-bordered" style="width:100%"> | ||
<thead> | ||
<tr> | ||
<th>Name</th> | ||
<th>Position</th> | ||
<th>Office</th> | ||
<th>Age</th> | ||
<th>Start date</th> | ||
<th>Salary</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<td>Cedric Kelly</td> | ||
<td>Senior Javascript Developer</td> | ||
<td>Edinburgh</td> | ||
<td>22</td> | ||
<td>2012-03-29</td> | ||
<td>$433,060</td> | ||
</tr> | ||
<tr> | ||
<td>Airi Satou</td> | ||
<td>Accountant</td> | ||
<td>Tokyo</td> | ||
<td>33</td> | ||
<td>2008-11-28</td> | ||
<td>$162,700</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
</div> | ||
{% endblock content %} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
{% extends "base.html" %} | ||
|
||
{% block content %} | ||
|
||
<div class="card shadow mb-4"> | ||
<div class="card-header py-3"> | ||
<h5 class="m-0 font-weight-bold text-center">{{ context['feature_title'] }}</h5> | ||
</div> | ||
<div class="card-body"> | ||
<p> | ||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut tortor velit, lacinia quis diam a, volutpat aliquet sem. Maecenas eget nulla a justo tristique vulputate. Quisque efficitur ipsum a lectus lobortis ultricies. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent volutpat metus nec eros luctus pulvinar. Cras laoreet elit ut cursus laoreet. Vivamus urna ante, finibus sit amet tellus sit amet, gravida vulputate augue. Suspendisse eget purus augue. Nulla luctus sit amet metus non suscipit. | ||
</p> | ||
</div> | ||
</div> | ||
|
||
<div class="row"> | ||
<div class="mb-3 col-auto"> | ||
<input class="form-control" type="file" id="formFile"> | ||
</div> | ||
<div class="mb-3 col-auto"> | ||
<button type="button" class="btn btn-primary">Submit</button> | ||
</div> | ||
</div> | ||
|
||
<table id="dataTable" class="table table-bordered" style="width:100%"> | ||
<thead> | ||
<tr> | ||
<th>Name</th> | ||
<th>Position</th> | ||
<th>Office</th> | ||
<th>Age</th> | ||
<th>Start date</th> | ||
<th>Salary</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<td>Cedric Kelly</td> | ||
<td>Senior Javascript Developer</td> | ||
<td>Edinburgh</td> | ||
<td>22</td> | ||
<td>2012-03-29</td> | ||
<td>$433,060</td> | ||
</tr> | ||
<tr> | ||
<td>Airi Satou</td> | ||
<td>Accountant</td> | ||
<td>Tokyo</td> | ||
<td>33</td> | ||
<td>2008-11-28</td> | ||
<td>$162,700</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
</div> | ||
{% endblock content %} |
Oops, something went wrong.