From 8c88c887ff5b89dceb27a9f0b627e3bce3748a12 Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Wed, 4 May 2022 17:19:56 -0300 Subject: [PATCH 1/8] =?UTF-8?q?C=C3=B3digo=20refatorado=20para=20usar=20bi?= =?UTF-8?q?blioteca=20requests-html;=20URL=20do=20campus=20Passo=20Fundo?= =?UTF-8?q?=20corrigida?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 13 ++++++------ requirements.txt | 2 +- webscraping/get_data.py | 46 ++++++++++++++++++++++++++++------------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index 1b46f29..285d4cd 100644 --- a/main.py +++ b/main.py @@ -22,21 +22,22 @@ async def ver_cardapio_campus(campus: str, response: Response): if campus == 'erechim': response.status_code = status.HTTP_302_FOUND return {"message": f"Campus {campus} está em desenvolvimento."} - bs = get_cardapio(campus) - if not bs: + html = get_cardapio(campus) + if not html: response.status_code = status.HTTP_404_NOT_FOUND return {"message": f"Campus {campus} não encontrado."} - cardapio = prepare_data(bs) + cardapio = prepare_data(html) response.status_code = status.HTTP_200_OK return {'cardapios': cardapio} + @app.get("/campus/{campus}/dia/{dia}") async def ver_cardapio_campus_dia(campus: str, dia: int, response: Response): - bs = get_cardapio(campus) - if not bs: + html = get_cardapio(campus) + if not html: response.status_code = status.HTTP_404_NOT_FOUND return {"message": f"Campus '{campus}' não encontrado."} - cardapios = prepare_data(bs) + cardapios = prepare_data(html) if dia not in range(0,5): response.status_code = status.HTTP_400_BAD_REQUEST return {"message": f"Informe um dia entre 0 - 4."} diff --git a/requirements.txt b/requirements.txt index a84a26d..5274225 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ fastapi -bs4 \ No newline at end of file +requests-html=0.10.0 \ No newline at end of file diff --git a/webscraping/get_data.py b/webscraping/get_data.py index 157e941..fa50daf 100644 --- a/webscraping/get_data.py +++ b/webscraping/get_data.py @@ -1,7 +1,9 @@ -from bs4 import BeautifulSoup -from urllib.request import urlopen +from requests_html import HTML, HTMLSession from urllib.error import HTTPError from unicodedata import normalize +from requests.exceptions import ChunkedEncodingError +from websockets.exceptions import ConnectionClosed + def get_value_by_position(lista: list, position: int): try: @@ -17,25 +19,40 @@ def normalize_url(url: str): def get_cardapio(campus: str): try: if campus == 'realeza': - html = urlopen(f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario/apresentacao-do-ru") + url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario/apresentacao-do-ru" + elif campus == 'passo-fundo': + url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante-universitario" else: - html = urlopen(f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario") + url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario" + + session = HTMLSession() + response = session.get(url, allow_redirects=False) + except HTTPError: return False - if html.code != 200: + except ChunkedEncodingError: + return False + except ConnectionClosed: + return False + + if response.status_code != 200: return False - return BeautifulSoup(html, 'html.parser') + + session.close() + return response.html -def prepare_data(bs: BeautifulSoup): - linhas = bs.find_all('section', {'id':'content-core'}) - conteudo_cardapios = linhas[0].findChildren('table') or linhas +def prepare_data(html: HTML): + conteudo_cardapios = html.find('table') or html.find('#content-core', first=True) + semanas = iter(html.find('#content-core p', containing='Semana ')[::-1]) cardapios = list() + for conteudo_cardapio in conteudo_cardapios: - cardapio_html = conteudo_cardapio.findChildren('td') + cardapio_html = conteudo_cardapio.find('td') + cardapio = { - 'semana': conteudo_cardapio.find_previous('p').text, - 'cardapio' : [ + 'semana': next(semanas).text, + 'cardapio': [ { 'dia': get_value_by_position(cardapio_html, key), 'salada': get_value_by_position(cardapio_html, 5+key), @@ -48,11 +65,12 @@ def prepare_data(bs: BeautifulSoup): 'mistura': get_value_by_position(cardapio_html, 40+key), 'mistura_vegana': get_value_by_position(cardapio_html, 45+key), 'sobremesa': get_value_by_position(cardapio_html, 50+key), - }for key in range(0, 5) + } for key in range(0, 5) ] } cardapios.append(cardapio) return cardapios + def get_cardapio_dia(dia: int, cardapios: list): - return list(map(lambda x: x['cardapio'][dia], cardapios)) \ No newline at end of file + return list(map(lambda x: x['cardapio'][dia], cardapios)) From 6d9f3f2f464cca47e97218acdec09ba2a62d5887 Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Wed, 4 May 2022 17:33:14 -0300 Subject: [PATCH 2/8] =?UTF-8?q?Campus=20Erechim=20est=C3=A1=20no=20ar.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/main.py b/main.py index 285d4cd..635e7ac 100644 --- a/main.py +++ b/main.py @@ -19,9 +19,6 @@ async def home(): @app.get("/campus/{campus}") async def ver_cardapio_campus(campus: str, response: Response): - if campus == 'erechim': - response.status_code = status.HTTP_302_FOUND - return {"message": f"Campus {campus} está em desenvolvimento."} html = get_cardapio(campus) if not html: response.status_code = status.HTTP_404_NOT_FOUND From 4e85a860ac64629e0ceaed41f051e5b5adb0af54 Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Tue, 10 May 2022 08:05:17 -0300 Subject: [PATCH 3/8] commit --- .idea/vcs.xml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 7516b511f26cae4af23d57a349d4d8a34d0ead5a Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Tue, 10 May 2022 08:10:10 -0300 Subject: [PATCH 4/8] Revert "adicionando readme e action para git" This reverts commit bab4fe8c69c34e3b94744c55c12dd127905a6b6b. --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a6e2f8e..523d96c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,5 +8,5 @@ jobs: - uses: actions/checkout@v2 #Be sure you check-out the repo first. Deta CLI needs access to the files - uses: BogDAAAMN/deta-deploy-action@v1.0.1 with: - deta-access-token: ${{ secrets.DETA_TOKEN }} #Deta access token https://docs.deta.sh/docs/cli/auth + deta-access-token: ${{ secrets.GCP_KEY }} #Deta access token https://docs.deta.sh/docs/cli/auth deta-name: 'apiRuUffs' #Deta Micro name https://docs.deta.sh/docs/cli/commands/#deta-clone From b6493620b63c0efa3470ca9be1b6988ab6e2fc92 Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Tue, 10 May 2022 08:11:41 -0300 Subject: [PATCH 5/8] Revert "commit" This reverts commit 4e85a860ac64629e0ceaed41f051e5b5adb0af54. --- .idea/vcs.xml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From a2d14e40e649f29d96418df26108fd16fba11c46 Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Tue, 10 May 2022 08:11:47 -0300 Subject: [PATCH 6/8] =?UTF-8?q?Revert=20"Campus=20Erechim=20est=C3=A1=20no?= =?UTF-8?q?=20ar."?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 6d9f3f2f464cca47e97218acdec09ba2a62d5887. --- main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.py b/main.py index 635e7ac..285d4cd 100644 --- a/main.py +++ b/main.py @@ -19,6 +19,9 @@ async def home(): @app.get("/campus/{campus}") async def ver_cardapio_campus(campus: str, response: Response): + if campus == 'erechim': + response.status_code = status.HTTP_302_FOUND + return {"message": f"Campus {campus} está em desenvolvimento."} html = get_cardapio(campus) if not html: response.status_code = status.HTTP_404_NOT_FOUND From 9cef6ccfd9cf74dab4300bb3551e2dcaa50d054d Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Tue, 10 May 2022 08:11:49 -0300 Subject: [PATCH 7/8] =?UTF-8?q?Revert=20"C=C3=B3digo=20refatorado=20para?= =?UTF-8?q?=20usar=20biblioteca=20requests-html;=20URL=20do=20campus=20Pas?= =?UTF-8?q?so=20Fundo=20corrigida"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 8c88c887ff5b89dceb27a9f0b627e3bce3748a12. --- main.py | 13 ++++++------ requirements.txt | 2 +- webscraping/get_data.py | 46 +++++++++++++---------------------------- 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/main.py b/main.py index 285d4cd..1b46f29 100644 --- a/main.py +++ b/main.py @@ -22,22 +22,21 @@ async def ver_cardapio_campus(campus: str, response: Response): if campus == 'erechim': response.status_code = status.HTTP_302_FOUND return {"message": f"Campus {campus} está em desenvolvimento."} - html = get_cardapio(campus) - if not html: + bs = get_cardapio(campus) + if not bs: response.status_code = status.HTTP_404_NOT_FOUND return {"message": f"Campus {campus} não encontrado."} - cardapio = prepare_data(html) + cardapio = prepare_data(bs) response.status_code = status.HTTP_200_OK return {'cardapios': cardapio} - @app.get("/campus/{campus}/dia/{dia}") async def ver_cardapio_campus_dia(campus: str, dia: int, response: Response): - html = get_cardapio(campus) - if not html: + bs = get_cardapio(campus) + if not bs: response.status_code = status.HTTP_404_NOT_FOUND return {"message": f"Campus '{campus}' não encontrado."} - cardapios = prepare_data(html) + cardapios = prepare_data(bs) if dia not in range(0,5): response.status_code = status.HTTP_400_BAD_REQUEST return {"message": f"Informe um dia entre 0 - 4."} diff --git a/requirements.txt b/requirements.txt index 5274225..a84a26d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ fastapi -requests-html=0.10.0 \ No newline at end of file +bs4 \ No newline at end of file diff --git a/webscraping/get_data.py b/webscraping/get_data.py index fa50daf..157e941 100644 --- a/webscraping/get_data.py +++ b/webscraping/get_data.py @@ -1,9 +1,7 @@ -from requests_html import HTML, HTMLSession +from bs4 import BeautifulSoup +from urllib.request import urlopen from urllib.error import HTTPError from unicodedata import normalize -from requests.exceptions import ChunkedEncodingError -from websockets.exceptions import ConnectionClosed - def get_value_by_position(lista: list, position: int): try: @@ -19,40 +17,25 @@ def normalize_url(url: str): def get_cardapio(campus: str): try: if campus == 'realeza': - url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario/apresentacao-do-ru" - elif campus == 'passo-fundo': - url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante-universitario" + html = urlopen(f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario/apresentacao-do-ru") else: - url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario" - - session = HTMLSession() - response = session.get(url, allow_redirects=False) - + html = urlopen(f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario") except HTTPError: return False - except ChunkedEncodingError: - return False - except ConnectionClosed: - return False - - if response.status_code != 200: + if html.code != 200: return False - - session.close() - return response.html + return BeautifulSoup(html, 'html.parser') -def prepare_data(html: HTML): - conteudo_cardapios = html.find('table') or html.find('#content-core', first=True) - semanas = iter(html.find('#content-core p', containing='Semana ')[::-1]) +def prepare_data(bs: BeautifulSoup): + linhas = bs.find_all('section', {'id':'content-core'}) + conteudo_cardapios = linhas[0].findChildren('table') or linhas cardapios = list() - for conteudo_cardapio in conteudo_cardapios: - cardapio_html = conteudo_cardapio.find('td') - + cardapio_html = conteudo_cardapio.findChildren('td') cardapio = { - 'semana': next(semanas).text, - 'cardapio': [ + 'semana': conteudo_cardapio.find_previous('p').text, + 'cardapio' : [ { 'dia': get_value_by_position(cardapio_html, key), 'salada': get_value_by_position(cardapio_html, 5+key), @@ -65,12 +48,11 @@ def prepare_data(html: HTML): 'mistura': get_value_by_position(cardapio_html, 40+key), 'mistura_vegana': get_value_by_position(cardapio_html, 45+key), 'sobremesa': get_value_by_position(cardapio_html, 50+key), - } for key in range(0, 5) + }for key in range(0, 5) ] } cardapios.append(cardapio) return cardapios - def get_cardapio_dia(dia: int, cardapios: list): - return list(map(lambda x: x['cardapio'][dia], cardapios)) + return list(map(lambda x: x['cardapio'][dia], cardapios)) \ No newline at end of file From 17e2001a90eaafc6f1445a8246fab7a2e64c9bf0 Mon Sep 17 00:00:00 2001 From: IsmaelBortoluzzi Date: Tue, 10 May 2022 08:17:49 -0300 Subject: [PATCH 8/8] reverting code refactoring --- .github/workflows/build.yml | 2 +- main.py | 1 + webscraping/get_data.py | 8 ++++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 523d96c..a6e2f8e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,5 +8,5 @@ jobs: - uses: actions/checkout@v2 #Be sure you check-out the repo first. Deta CLI needs access to the files - uses: BogDAAAMN/deta-deploy-action@v1.0.1 with: - deta-access-token: ${{ secrets.GCP_KEY }} #Deta access token https://docs.deta.sh/docs/cli/auth + deta-access-token: ${{ secrets.DETA_TOKEN }} #Deta access token https://docs.deta.sh/docs/cli/auth deta-name: 'apiRuUffs' #Deta Micro name https://docs.deta.sh/docs/cli/commands/#deta-clone diff --git a/main.py b/main.py index 1b46f29..ffef5f5 100644 --- a/main.py +++ b/main.py @@ -30,6 +30,7 @@ async def ver_cardapio_campus(campus: str, response: Response): response.status_code = status.HTTP_200_OK return {'cardapios': cardapio} + @app.get("/campus/{campus}/dia/{dia}") async def ver_cardapio_campus_dia(campus: str, dia: int, response: Response): bs = get_cardapio(campus) diff --git a/webscraping/get_data.py b/webscraping/get_data.py index 157e941..4119ac7 100644 --- a/webscraping/get_data.py +++ b/webscraping/get_data.py @@ -17,9 +17,13 @@ def normalize_url(url: str): def get_cardapio(campus: str): try: if campus == 'realeza': - html = urlopen(f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario/apresentacao-do-ru") + url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario/apresentacao-do-ru" + elif campus == 'passo-fundo': + url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante-universitario" else: - html = urlopen(f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario") + url = f"https://www.uffs.edu.br/campi/{normalize_url(campus)}/restaurante_universitario" + html = urlopen(url) + except HTTPError: return False if html.code != 200: