diff --git a/README.md b/README.md index 9c0acba6..2bc9fd17 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,13 @@ # Python Flathunter-Helper ## Disclaimer -This script crawls websites and looks for new offers. Any changes to the webpages can break this script immediately, use with caution. +This script crawls websites and looks for new flat offers. If there is a new offer, a Telegram Bot will notify you. + +Currently the script supports the following websites: +- wg-gesucht.de +- ebay-kleinanzeigen.de +- immobilienscout24.de +- immowelt.de ## Setup @@ -15,7 +21,9 @@ git clone https://github.com/GerRudi/flathunter.git apt install python3-pip cp config.yaml.dist config.yaml nano config.yaml --> Do your edits to config file +``` +Now edit the config file in the nano editor. +``` apt install python3-setuptools apt install python3-wheel @@ -39,7 +47,7 @@ to Telegram User optional arguments: -h, --help show this help message and exit --config CONFIG, -c CONFIG - Config file to use. If not set, try to use + Config file to use, usually 'config.yaml'. If not set, try to use '~git-clone-dir/config.yaml' ``` @@ -73,5 +81,8 @@ Since this feature is not free, I "disabled" it. Read line 62 in hunter.py to re - [@tschuehly](https://github.com/tschuehly) - [@Cugu](https://github.com/Cugu) - [@GerRudi](https://github.com/GerRudi) +- [@calbec](https://github.com/calbec) +## License +[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0) diff --git a/config.yaml.dist b/config.yaml.dist index 1e759c60..12104aef 100644 --- a/config.yaml.dist +++ b/config.yaml.dist @@ -9,7 +9,7 @@ loop: # List the URLs in the following format: # urls: # - https://www.immobilienscout24.de/Suche/... -# - https://www.wg-gesucht.de/... +# - https://www.wg-gesucht.de/... # Use list search in wg-gesucht # - https://www.immowelt.de/... 
urls: @@ -26,7 +26,7 @@ urls: # # The example configuration below includes a place for # "John", located at the main train station of munich. -# Two kinds of travel (bicycle and transit) are requested, +# Three kinds of travel (bicycle, walking and transit) are requested, # each with a different label. Furthermore a place for # "Jane" is included, located at the given destination and # with the same kinds of travel. diff --git a/flathunter.py b/flathunter.py index 41c5546c..ce74a3f9 100755 --- a/flathunter.py +++ b/flathunter.py @@ -11,7 +11,7 @@ from flathunter.idmaintainer import IdMaintainer from flathunter.hunter import Hunter from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen -from flathunter.crawl_immowelt import CrawlImmowelt +from flathunter.crawl_immowelt import CrawlImmoWelt __author__ = "Jan Harrie" __version__ = "1.0" @@ -38,7 +38,7 @@ def launch_flat_hunt(config): - searchers = [CrawlImmobilienscout(), CrawlWgGesucht(),CrawlEbayKleinanzeigen(),CrawlImmowelt()] + searchers = [CrawlImmobilienscout(), CrawlWgGesucht(),CrawlEbayKleinanzeigen(),CrawlImmoWelt()] id_watch = IdMaintainer('%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__))) hunter = Hunter() @@ -64,7 +64,7 @@ def main(): # load config config_handle = args.config __log__.info("Using config %s" % config_handle.name) - config = yaml.load(config_handle.read()) + config = yaml.load(config_handle.read(), Loader=yaml.FullLoader) # check config if not config.get('telegram', dict()).get('bot_token'): diff --git a/flathunter/crawl_ebaykleinanzeigen.py b/flathunter/crawl_ebaykleinanzeigen.py index 9667a3e5..c5763f1d 100644 --- a/flathunter/crawl_ebaykleinanzeigen.py +++ b/flathunter/crawl_ebaykleinanzeigen.py @@ -23,7 +23,8 @@ def get_results(self, search_url): return entries def get_page(self, search_url): - resp = requests.get(search_url) # TODO add page_no in url + headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/50.0.2661.102 Safari/537.36'} + resp = requests.get(search_url, headers=headers) # TODO add page_no in url if resp.status_code != 200: self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content)) return BeautifulSoup(resp.content, 'html.parser') diff --git a/flathunter/crawl_immowelt.py b/flathunter/crawl_immowelt.py index 94d75095..74c9579f 100644 --- a/flathunter/crawl_immowelt.py +++ b/flathunter/crawl_immowelt.py @@ -1,10 +1,9 @@ -import logging -import requests -import re +# coding= UTF-8 +import logging, requests, re from bs4 import BeautifulSoup -class CrawlImmowelt: +class CrawlImmoWelt: __log__ = logging.getLogger(__name__) URL_PATTERN = re.compile(r'https://www\.immowelt\.de') @@ -30,77 +29,32 @@ def get_page(self, search_url): def extract_data(self, soup): entries = [] - soup = soup.find(id="listItemWrapperFixed") - try: - title_elements = soup.find_all("h2") - except AttributeError: - return entries - expose_ids=soup.find_all("div", class_="listitem_wrap") - - - #soup.find_all(lambda e: e.has_attr('data-adid')) - #print(expose_ids) - for idx,title_el in enumerate(title_elements): - - tags = expose_ids[idx].find_all(class_="hardfact") - url = "https://www.immowelt.de/" +expose_ids[idx].find("a").get("href") - address = expose_ids[idx].find(class_="listlocation") - address.find("span").extract() - address.find("strong").extract() - print(address.text.strip()) - address = address.text.strip() - - try: - print(tags[0].find("strong").text) - price = tags[0].find("strong").text.strip() - except IndexError: - print("Kein Preis angegeben") - price = "Auf Anfrage" - - try: - tags[1].find("div").extract() - print(tags[1].text.strip()) - size = tags[1].text.strip() - except IndexError: - size = "Nicht gegeben" - print("Quadratmeter nicht angegeben") - - try: - tags[2].find("div").extract() - print(tags[2].text.strip()) - rooms = tags[2].text.strip() - except IndexError: - print("Keine Zimmeranzahl gegeben") - rooms = "Nicht gegeben" 
- + soup = soup.find('div',class_ = "iw_list_content") + #print soup + results = soup.find_all(lambda e: e.has_attr('data-estateid') and not e.has_attr('data-action')) + #print results + for index,listing in enumerate(results): + price = listing.find('div',class_="hardfact price_rent").find("strong").text.strip() + id = listing.find('a').get('href').split('expose/',1)[1].split('?',1)[0].strip() + id = int(id,base=36) + url = "https://www.immowelt.de" + listing.find('a').get('href') + size = listing.find('div',class_="hardfact ").text + size = size.split('ca.)',1)[1].strip() + rooms = listing.find('div',class_="hardfact rooms").text + rooms = rooms.split('Zimmer',1)[1].strip() + address = listing.find('div',class_="listlocation ellipsis relative").text.strip() + title = listing.find('h2').text.strip() details = { - 'id': int(expose_ids[idx].get("data-estateid")), + 'id': id, 'url': url , - 'title': title_el.text.strip(), + 'title': title, 'price': price, 'size': size, 'rooms': rooms , 'address': address - } entries.append(details) - + self.__log__.debug('extracted: ' + str(entries)) return entries - - def load_address(self, url): - # extract address from expose itself - exposeHTML = requests.get(url).content - exposeSoup = BeautifulSoup(exposeHTML, 'html.parser') - try: - street_raw = exposeSoup.find(id="street-address").text - except AttributeError: - street_raw="" - try: - address_raw = exposeSoup.find(id="viewad-locality").text - except AttributeError: - address_raw ="" - address = address_raw.strip().replace("\n","") + " "+street_raw.strip() - - return address diff --git a/flathunter/crawl_wggesucht.py b/flathunter/crawl_wggesucht.py index 97ffa76a..35345017 100644 --- a/flathunter/crawl_wggesucht.py +++ b/flathunter/crawl_wggesucht.py @@ -43,7 +43,7 @@ def get_page(self, search_url, page_no): def extract_data(self, soup): entries = [] - findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--')) + findings = soup.find_all(lambda e: 
e.has_attr('id') and e['id'].startswith('ad-')) existingFindings = list( filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))