bot.py
# -*- coding: utf-8 -*-
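# Apartment-hunting bot: scrapes PAP.fr search results for Paris 18e rentals,
# stores new listings in a Google Sheet, and sends SMS alerts via CALLR.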
import os
import re
import requests
import pygsheets
import callr
from pyshorteners import Shortener
from bs4 import BeautifulSoup as Bs
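
# Scraper configuration: the search entry point and the CSS selectors for the
# pieces of each listing page we extract.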
SEARCH_PAGE = 'http://www.pap.fr/annonce/locations-appartement-paris-18e-g37785-3-pieces-jusqu-a-1500-euros'
SPREADSHEET_URL = os.environ.get('SPREADSHEET_URL')
URL_DOMAIN = 'http://www.pap.fr'
PAGINATION_SELECTOR = '.pagination li a'
LISTING_DETAIL_BTN_SELECTOR = '.btn-details'
NEXT_PAGE_SELECTOR = '.next'
GEOLOC_SELECTOR = '.item-geoloc'
SPECS_SELECTOR = '.item-summary'
DESCRIPTION_SELECTOR = '.item-description'
METRO_SELECTOR = '.item-metro .label'
PRICE_SELECTOR = '.price'
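
# Credentials are read from the environment so they stay out of the repo.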
CALLR_API_LOGIN = os.environ.get('LOGIN')
CALLR_API_PASSWORD = os.environ.get('PASSWORD')
GOOGLE_SHORTENER_API_KEY = os.environ.get('API_KEY')
PHONE = os.environ.get('PHONE')
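
# Third-party clients. Note: this is the pyshorteners 0.x API; Google's URL
# shortener service has since been shut down, so a different backend may be
# needed with newer versions of the library.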
shortener = Shortener('Google', api_key=GOOGLE_SHORTENER_API_KEY)
api = callr.Api(CALLR_API_LOGIN, CALLR_API_PASSWORD)
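
# Fetch a URL and return its parsed DOM.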
def get_scraped_page(url):
    res = requests.get(url)
    return Bs(res.text, 'lxml')
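
# Strip HTML tags; special characters are normalized first.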
def clean_markup(string):
    string = clean_special_chars(string)
    return re.sub(r'<[^>]*>', '', string)
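
# Collapse newlines, tabs and runs of whitespace into single spaces.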
def clean_spaces(string):
    string = re.sub(r'\n|\r|\t', ' ', string)
    return re.sub(r'\s{2,}', ' ', string).strip()
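
# Swap characters that may not survive SMS encoding for ASCII equivalents.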
def clean_special_chars(string):
    return string.replace('²', '2').replace('€', 'e')
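
# Scrape one results page and return the parsed details of every listing on
# it. Returns an empty list on failure so the caller's loop can continue.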
def process_listings_page(link):
    try:
        dom = get_scraped_page(link)
        details_urls = [
            URL_DOMAIN + btn.get('href')
            for btn in dom.select(LISTING_DETAIL_BTN_SELECTOR)
        ]
        return [
            process_listing(listing_details_url)
            for listing_details_url in details_urls
        ]
    except Exception as e:
        print(e)
        return []
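
# Parse a single listing page into a dict of its key fields.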
def process_listing(listing):
    dom = get_scraped_page(listing)
    print('Processing ' + listing)
    specs = ' / '.join([
        clean_spaces(clean_markup(str(li).replace('<strong>', ': ').lower()))
        for li in dom.select(SPECS_SELECTOR)[0].select('li')
    ])
    description_body = dom.select(DESCRIPTION_SELECTOR)[0]
    location = dom.select(GEOLOC_SELECTOR)[0].h2.text
    metro = ', '.join([clean_markup(elm.get_text()) for elm in dom.select(METRO_SELECTOR)])
    description = clean_spaces(description_body.get_text())
    price = dom.select(PRICE_SELECTOR)[0].text
    return {
        'specs': specs,
        'location': location,
        'description': description,
        'metro': metro,
        'url': listing,
        'price': price
    }
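
# Send a one-line summary of a listing by SMS through CALLR, with the URL
# shortened to keep the message compact.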
def send_data_via_sms(data):
    msg = "{0} - {1} - {2} - {3} - {4}".format(
        data['specs'], data['price'], data['location'], data['metro'],
        shortener.short(data['url'])
    )
    api.call('sms.send', 'SMS', PHONE, msg, None)
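
# Main flow: open the sheet, collect the search page plus its pagination
# links, then store every listing whose URL is not already in column 5.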
try:
    gc = pygsheets.authorize(service_file='credentials.json')
    sheet = gc.open_by_url(SPREADSHEET_URL).sheet1
    dom = get_scraped_page(SEARCH_PAGE)
    links = [SEARCH_PAGE] + [
        URL_DOMAIN + a.get('href')
        for a in dom.select(PAGINATION_SELECTOR)
    ]
    urls_stored = sheet.get_col(5)
    for link in links:
        for ls in process_listings_page(link):
            if ls['url'] not in urls_stored:
                sheet.insert_rows(row=0, values=[
                    ls['specs'], ls['location'],
                    ls['description'], ls['metro'], ls['url']
                ])
                # If this is not the first run (i.e. urls_stored is not empty),
                # send an SMS alert for this newly seen listing.
                if urls_stored:
                    send_data_via_sms(ls)
                # Guard against re-inserting the same listing if it appears
                # on more than one results page during this run.
                urls_stored.append(ls['url'])
except Exception as e:
    print(e)