diff --git a/scrapers/beyazperde_scraper.py b/scrapers/beyazperde_scraper.py index 69822ee..bbd760b 100644 --- a/scrapers/beyazperde_scraper.py +++ b/scrapers/beyazperde_scraper.py @@ -2,253 +2,218 @@ import time import pandas as pd from selenium import webdriver +from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.common.keys import Keys -def initialize(): - - print(""" - --------------------------------------------------------- - - Beyazperde Scraper'a hoş geldiniz! - - - Geliştirici: Arda Uzunoğlu - - --------------------------------------------------------- - """) - - global film, dosya_adi, delay, review_texts, review_useful, review_not_useful, review_scores, member_name_texts, date_texts, scrape_useful, scrape_scores, scrape_member_name, scrape_date, path - - film = input("İncelemelerin Çekileceği Film: ") - dosya_adi = input("Oluşturulacak Excel dosyasının adı: ") - dosya_adi = dosya_adi + ".xlsx" - delay = int(input("Bekleme süresi: ")) - - review_texts = [] - review_useful = [] - review_not_useful = [] - review_scores = [] - member_name_texts = [] - date_texts = [] - - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - while (scrape_useful_input.lower() != "y") or (scrape_useful_input.lower() != "n"): - if scrape_useful_input.lower() == "y": - scrape_useful = True - break - - elif scrape_useful_input.lower() == "n": - scrape_useful = False - break - - else: - print("Geçersiz yanıt.") - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - print("\n") +def beyazperde_scrape(): + def initialize(): + def preference(scrape_input): + while (scrape_input.lower() != "y") or (scrape_input.lower() != "n"): + if scrape_input.lower() == "y": + output = True + break + + elif scrape_input.lower() == "n": + output = False + break + + else: + print("Geçersiz yanıt.") + scrape_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + + return output + + print(""" + 
--------------------------------------------------------- + - Beyazperde Scraper'a hoş geldiniz! - + - Geliştirici: Arda Uzunoğlu - + --------------------------------------------------------- + """) + + global film, file, delay, review_texts, review_useful, review_not_useful, review_scores, member_name_texts, date_texts, scrape_useful, scrape_scores, scrape_member_name, scrape_date, path + + film = input("İncelemelerin Çekileceği Film: ") + file = input("Oluşturulacak Excel dosyasının adı: ") + file = file + ".xlsx" + delay = int(input("Bekleme süresi: ")) + + review_texts = [] + review_useful = [] + review_not_useful = [] + review_scores = [] + member_name_texts = [] + date_texts = [] + + scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + scrape_scores_input = input("Filme verilen puan çekilsin mi(y/n): ") + scrape_member_name_input = input("Kullanıcı isimleri çekilsin mi(y/n): ") + scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") + + scrape_useful = preference(scrape_useful_input) + scrape_scores = preference(scrape_scores_input) + scrape_member_name = preference(scrape_member_name_input) + scrape_date = preference(scrape_date_input) + + path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" + + def scrape(): + try: + driver = webdriver.Chrome(path) + time.sleep(delay) - scrape_scores_input = input("Filme verilen puan çekilsin mi(y/n): ") - while (scrape_scores_input.lower() != "y") or (scrape_scores_input.lower() != "n"): - if scrape_scores_input.lower() == "y": - scrape_scores = True - break + except: + print("Chromedriver kullanılamıyor.") + sys.exit() - elif scrape_scores_input.lower() == "n": - scrape_scores = False - break + try: + driver.get("http://www.beyazperde.com") + time.sleep(delay) + driver.maximize_window() + time.sleep(delay) - else: - print("Geçersiz yanıt.") - scrape_scores_input = input("Filme verilen puan çekilsin mi(y/n): ") - print("\n") + except: + print("Beyazperde'ye erişilemiyor.") + sys.exit() 
- scrape_member_name_input = input("Kullanıcı isimleri çekilsin mi(y/n): ") - while (scrape_member_name_input.lower() != "y") or (scrape_member_name_input.lower() != "n"): - if scrape_member_name_input.lower() == "y": - scrape_member_name = True - break + try: + search = driver.find_element_by_class_name("header-search-input") + search.send_keys(film) + time.sleep(delay+3) - elif scrape_member_name_input.lower() == "n": - scrape_member_name = False - break + auto_complete = driver.find_element_by_class_name("autocomplete-result-title") + auto_complete.click() + time.sleep(delay) - else: - print("Geçersiz yanıt.") - scrape_member_name_input = input("Kullanıcı isimleri çekilsin mi(y/n): ") - print("\n") + except: + print("Film bulunamadı.") + sys.exit() - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - while (scrape_date_input.lower() != "y") or (scrape_date_input.lower() != "n"): - if scrape_date_input.lower() == "y": - scrape_date = True - break + try: + member_reviews = driver.find_element_by_link_text("Üye Eleştirileri") + member_reviews.click() + time.sleep(delay) - elif scrape_date_input.lower() == "n": - scrape_date = False - break + except: + print("Film incelemeleri bulunamadı.") + sys.exit() - else: - print("Geçersiz yanıt.") - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - print("\n") - - path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" - -def scrape(): - try: - driver = webdriver.Chrome(path) - time.sleep(delay) - - except: - print("Chromedriver kullanılamıyor.") - sys.exit() - - try: - driver.get("http://www.beyazperde.com") - time.sleep(delay) - driver.maximize_window() - time.sleep(delay) - - except: - print("Beyazperde'ye erişilemiyor.") - sys.exit() - - try: - search = driver.find_element_by_class_name("header-search-input") - search.send_keys(film) - time.sleep(delay+3) - - auto_complete = driver.find_element_by_class_name("autocomplete-result-title") - auto_complete.click() - time.sleep(delay) - - except: - 
print("Film bulunamadı.") - sys.exit() - - try: - member_reviews = driver.find_element_by_link_text("Üye Eleştirileri") - member_reviews.click() - time.sleep(delay) - - except: - print("Film incelemeleri bulunamadı.") - sys.exit() - - try: - close_banner = driver.find_element_by_id("creativeClose") - close_banner.click() - time.sleep(delay) - - except: - pass - - review_count = driver.find_element_by_class_name("titlebar-title.titlebar-title-md") - review_count = review_count.text - review_count = review_count.replace(" kullanıcı eleştirisi","") - review_count = int(review_count) + try: + close_banner = driver.find_element_by_id("creativeClose") + close_banner.click() + time.sleep(delay) - if review_count % 20 == 0: - review_page_count = review_count // 20 + except: + pass + + review_count = driver.find_element_by_class_name("titlebar-title.titlebar-title-md").text + review_count = int(review_count.replace(" kullanıcı eleştirisi","")) - else: - review_page_count = (review_count // 20) + 1 + if (review_count % 20) == 0: + review_page_count = review_count // 20 - constant_url = driver.current_url - l = 1 + else: + review_page_count = (review_count // 20) + 1 - while l <= review_page_count: + constant_url = driver.current_url - lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") - match = False + l = 1 + while l <= review_page_count: - while match == False: - lastCount = lenOfPage - time.sleep(delay) lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") - if lastCount == lenOfPage: - match = True - - print("Veriler çekiliyor...") - print("Sayfa: " + str(l)) - - reviews = driver.find_elements_by_class_name("review-card-content") - for review in reviews: - review = review.text - review_texts.append(review) - - usefuls = driver.find_elements_by_class_name("reviews-users-comment-useful") - 
for useful_unp in usefuls: - useful_unp = useful_unp.text - useful_unp = useful_unp.split() - - useful = useful_unp[0][0] - not_useful = useful_unp[0][1] - - review_useful.append(useful) - review_not_useful.append(not_useful) - - scores = driver.find_elements_by_class_name("stareval-note") - for score in scores: - score = score.text - score = score.replace(",0","") - review_scores.append(score) - - member_names = driver.find_elements_by_class_name("review-card-user-infos.cf") - for member_name in member_names: - member_name = member_name.text - seperation = member_name.index("\n") - member_name = member_name[:seperation] - member_name_texts.append(member_name) - - dates = driver.find_elements_by_class_name("review-card-meta-date") - for date in dates: - date = date.text - date = date.split() - date = date[:3] - date = " ".join(date) - date_texts.append(date) - - l += 1 - url = constant_url + "?page=" + str(l) - driver.get(url) - - driver.close() + match = False + + while match == False: + lastCount = lenOfPage + time.sleep(delay) + lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") + if lastCount == lenOfPage: + match = True + + print("Veriler çekiliyor...") + print("Sayfa: " + str(l)) + + reviews = driver.find_elements_by_class_name("review-card-content") + for review in reviews: + review = review.text + review_texts.append(review) + + usefuls = driver.find_elements_by_class_name("reviews-users-comment-useful") + for useful_unp in usefuls: + useful_unp = useful_unp.text + useful_unp = useful_unp.split() + + useful = useful_unp[0][0] + not_useful = useful_unp[0][1] + + review_useful.append(useful) + review_not_useful.append(not_useful) + + scores = driver.find_elements_by_class_name("stareval-note") + for score in scores: + score = score.text + score = score.replace(",0","") + review_scores.append(score) + + member_names = 
driver.find_elements_by_class_name("review-card-user-infos.cf") + for member_name in member_names: + member_name = member_name.text + seperation = member_name.index("\n") + member_name = member_name[:seperation] + member_name_texts.append(member_name) + + dates = driver.find_elements_by_class_name("review-card-meta-date") + for date in dates: + date = date.text + date = date.split() + date = date[:3] + date = " ".join(date) + date_texts.append(date) + + l += 1 + url = constant_url + "?page=" + str(l) + driver.get(url) + + driver.close() + + length_list = [review_texts, review_useful, review_not_useful, review_scores, member_name_texts, date_texts] + limit = map(len, length_list) + limit = min(list(limit)) + limit -= 1 + + review_texts_fin = review_texts[:limit] + df = pd.DataFrame({"İncelemeler":review_texts_fin}) + + if scrape_useful: + review_useful_fin = review_useful[:limit] + review_not_useful_fin = review_not_useful[:limit] + df["İncelemeyi Yararlı Bulanlar"] = review_useful_fin + df["İncelemeyi Yararlı Bulmayanlar"] = review_not_useful_fin + + if scrape_scores: + review_scores_fin = review_scores[:limit] + df["İnceleme Puanları"] = review_scores_fin + + if scrape_member_name: + member_name_texts_fin = member_name_texts[:limit] + df["İncelemeyi Yayınlayan Kişi"] = member_name_texts_fin + + if scrape_date: + date_texts_fin = date_texts[:limit] + df["İncelemenin Yayınlanma Tarihi"] = date_texts_fin + + df.to_excel(file, header = True, index = False) + x = "Çektiğiniz veriler "+ file + " adlı excel dosyasına kaydedildi." + print(x) + print(""" + -------------------------------------------------------------------------- + - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. 
- + - Github Hesabım: ardauzunoglu - + -------------------------------------------------------------------------- + """) + time.sleep(3) - kisa = [len(review_texts), len(member_name_texts), len(date_texts), len(review_useful), len(review_not_useful)] - kisa = min(kisa) - kisa -= 1 - - review_texts_fin = review_texts[:kisa] - df = pd.DataFrame({"İncelemeler":review_texts_fin}) - - if scrape_useful: - review_useful_fin = review_useful[:kisa] - review_not_useful_fin = review_not_useful[:kisa] - df["İncelemeyi Yararlı Bulanlar"] = review_useful_fin - df["İncelemeyi Yararlı Bulmayanlar"] = review_not_useful_fin - - if scrape_scores: - review_scores_fin = review_scores[:kisa] - df["İnceleme Puanları"] = review_scores_fin - - if scrape_member_name: - member_name_texts_fin = member_name_texts[:kisa] - df["İncelemeyi Yayınlayan Kişi"] = member_name_texts_fin - - if scrape_date: - date_texts_fin = date_texts[:kisa] - df["İncelemenin Yayınlanma Tarihi"] = date_texts_fin - - df.to_excel(dosya_adi, header = True, index = False) - - x = "Çektiğiniz veriler "+ dosya_adi + " adlı excel dosyasına kaydedildi." - print(x) - - print(""" - -------------------------------------------------------------------------- - - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. - - - Github Hesabım: ardauzunoglu - - -------------------------------------------------------------------------- - """) - - time.sleep(3) + initialize() + scrape() if __name__ == "__main__": - initialize() - scrape() \ No newline at end of file + beyazperde_scrape() \ No newline at end of file diff --git a/scrapers/eksi_scraper.py b/scrapers/eksi_scraper.py index 94b26a8..c7a4409 100644 --- a/scrapers/eksi_scraper.py +++ b/scrapers/eksi_scraper.py @@ -5,27 +5,36 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.keys import Keys -def initialize(): - - print(""" - --------------------------------------------------------- - - Ekşi Sözlük Scraper'a hoş geldiniz! 
- - - Geliştirici: Arda Uzunoğlu - - --------------------------------------------------------- - """) - - global path - - path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" - -def scrape(): - def spesific_initialize(): - - global baslik, dosya_adi, delay, entry_texts, author_texts, date_texts, scrape_author_input, scrape_date_input, scrape_author, scrape_date - - baslik = input("Entrylerin çekileceği başlık: ") - dosya_adi = input("Oluşturulacak Excel dosyasının adı: ") - dosya_adi = dosya_adi + ".xlsx" +def eksisozluk_scrape(): + def initialize(): + def preference(scrape_input): + while (scrape_input.lower() != "y") or (scrape_input.lower() != "n"): + if scrape_input.lower() == "y": + output = True + break + + elif scrape_input.lower() == "n": + output = False + break + + else: + print("Geçersiz yanıt.") + scrape_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + + return output + + print(""" + --------------------------------------------------------- + - Ekşi Sözlük Scraper'a hoş geldiniz! 
- + - Geliştirici: Arda Uzunoğlu - + --------------------------------------------------------- + """) + + global title, file, delay, entry_texts, author_texts, date_texts, scrape_author_input, scrape_date_input, scrape_author, scrape_date, path + + title = input("Entrylerin çekileceği başlık: ") + file = input("Oluşturulacak Excel dosyasının adı: ") + file = file + ".xlsx" delay = int(input("Bekleme süresi(sn): ")) entry_texts = [] @@ -33,36 +42,14 @@ def spesific_initialize(): date_texts = [] scrape_author_input = input("Yazar isimleri çekilsin mi(y/n): ") - while (scrape_author_input.lower() != "y") or (scrape_author_input.lower() != "n"): - if scrape_author_input.lower() == "y": - scrape_author = True - break - - elif scrape_author_input.lower() == "n": - scrape_author = False - break - - else: - print("Geçersiz yanıt.") - scrape_author_input = input("Yazar isimleri çekilsin mi(y/n): ") - print("\n") - scrape_date_input = input("Entry tarihleri çekilsin mi(y/n): ") - while (scrape_date_input.lower() != "y") or (scrape_date_input.lower() != "n"): - if scrape_date_input.lower() == "y": - scrape_date = True - break - elif scrape_date_input.lower() == "n": - scrape_date = False - break + scrape_author = preference(scrape_author_input) + scrape_date = preference(scrape_date_input) - else: - print("Geçersiz yanıt.") - scrape_date_input = input("Entry tarihleri çekilsin mi(y/n): ") - print("\n") + path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" - def spesific_scrape(): + def scrape(): try: print("Chromedriver'a erişiliyor...") driver = webdriver.Chrome(path) @@ -88,7 +75,7 @@ def spesific_scrape(): try: print("Başlık aranıyor...") arama_bari = driver.find_element_by_id("search-textbox") - arama_bari.send_keys(baslik) + arama_bari.send_keys(title) arama_bari.send_keys(Keys.ENTER) time.sleep(delay) print("Başlık bulundu.") @@ -114,15 +101,13 @@ def spesific_scrape(): time.sleep(delay) entries = driver.find_elements_by_css_selector(".content") - for entry in entries: entry 
= entry.text entry_texts.append(entry) time.sleep(delay) - dates = driver.find_elements_by_class_name("entry-date") - + dates = driver.find_elements_by_class_name("entry-date") for date in dates: date = date.text date_texts.append(date) @@ -130,7 +115,6 @@ def spesific_scrape(): time.sleep(delay) authors = driver.find_elements_by_class_name("entry-author") - for author in authors: author = author.text author_texts.append(author) @@ -151,41 +135,40 @@ def spesific_scrape(): pass driver.close() - kisa = [len(entry_texts), len(author_texts), len(date_texts)] - kisa = min(kisa) - kisa -= 1 - entry_texts_fin = entry_texts[:kisa] + length_list = [entry_texts, author_texts, date_texts] + limit = map(len, length_list) + limit = min(list(limit)) + limit -= 1 + + entry_texts_fin = entry_texts[:limit] df = pd.DataFrame({"Entryler": entry_texts_fin}) if scrape_date: - date_texts_fin = date_texts[:kisa] + date_texts_fin = date_texts[:limit] df["Tarihler"] = date_texts_fin if scrape_author: - author_texts_fin = author_texts[:kisa] + author_texts_fin = author_texts[:limit] df["Yazarlar"] = author_texts_fin - df.to_excel(dosya_adi, header = True, index = False) + df.to_excel(file, header = True, index = False) print("Başlık kazıması tamamlandı.") - print("Çektiğiniz veriler "+ dosya_adi + " adlı excel dosyasına kaydedildi.") + print("Çektiğiniz veriler "+ file + " adlı excel dosyasına kaydedildi.") - spesific_initialize() - spesific_scrape() -def end(): + print(""" + -------------------------------------------------------------------------- + - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. - + - Github Hesabım: ardauzunoglu - + -------------------------------------------------------------------------- + """) - print(""" - -------------------------------------------------------------------------- - - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. 
- - - Github Hesabım: ardauzunoglu - - -------------------------------------------------------------------------- - """) + time.sleep(3) - time.sleep(3) - -if __name__ == "__main__": initialize() scrape() - end() \ No newline at end of file + +if __name__ == "__main__": + eksisozluk_scrape() \ No newline at end of file diff --git a/scrapers/hepsiburada_scraper.py b/scrapers/hepsiburada_scraper.py index 941ac9a..bc2382e 100644 --- a/scrapers/hepsiburada_scraper.py +++ b/scrapers/hepsiburada_scraper.py @@ -4,303 +4,263 @@ from selenium import webdriver from selenium.webdriver.common.keys import Keys -def initialize(): - - print(""" - --------------------------------------------------------- - - Hepsiburada Scraper'a hoş geldiniz! - - - Geliştirici: Arda Uzunoğlu - - --------------------------------------------------------- - """) - - global urun_adi, dosya_adi, delay, review_texts, review_useful, review_not_useful, customer_name_texts, customer_province_texts, customer_age_texts, date_texts, scrape_useful, scrape_customer_name, scrape_customer_province, scrape_customer_age, scrape_date, path - - urun_adi = input("Değerlendirmelerin çekileceği ürün adı: ") - dosya_adi = input("Oluşturulacak Excel dosyasının adı: ") - dosya_adi = dosya_adi + ".xlsx" - delay = int(input("Bekleme süresi: ")) - - review_texts = [] - review_useful = [] - review_not_useful = [] - customer_name_texts = [] - customer_province_texts = [] - customer_age_texts = [] - date_texts = [] - - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - while (scrape_useful_input.lower() != "y") or (scrape_useful_input.lower() != "n"): - if scrape_useful_input.lower() == "y": - scrape_useful = True - break - - elif scrape_useful_input.lower() == "n": - scrape_useful = False - break - - else: - print("Geçersiz yanıt.") - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - print("\n") - - scrape_customer_name_input = input("Müşteri isimleri 
çekilsin mi(y/n): ") - while (scrape_customer_name_input.lower() != "y") or (scrape_customer_name_input.lower() != "n"): - if scrape_customer_name_input.lower() == "y": - scrape_customer_name = True - break - - elif scrape_customer_name_input.lower() == "n": - scrape_customer_name = False - break - - else: - print("Geçersiz yanıt.") - scrape_customer_name_input = input("Müşteri isimleri çekilsin mi(y/n): ") - print("\n") - - scrape_customer_province_input = input("Müşteri konumları çekilsin mi(y/n): ") - while (scrape_customer_province_input.lower() != "y") or (scrape_customer_province_input.lower() != "n"): - if scrape_customer_province_input.lower() == "y": - scrape_customer_province = True - break - - elif scrape_customer_province_input.lower() == "n": - scrape_customer_province = False - break - - else: - print("Geçersiz yanıt.") - scrape_customer_province_input = input("Müşteri konumları çekilsin mi(y/n): ") - print("\n") - - scrape_customer_age_input = input("Müşteri yaşları çekilsin mi(y/n): ") - while (scrape_customer_age_input.lower() != "y") or (scrape_customer_age_input.lower() != "n"): - if scrape_customer_age_input.lower() == "y": - scrape_customer_age = True - break - - elif scrape_customer_age_input.lower() == "n": - scrape_customer_age = False - break - - else: - print("Geçersiz yanıt.") - scrape_customer_age_input = input("Müşteri yaşları çekilsin mi(y/n): ") - print("\n") - - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - while (scrape_date_input.lower() != "y") or (scrape_date_input.lower() != "n"): - if scrape_date_input.lower() == "y": - scrape_date = True - break - - elif scrape_date_input.lower() == "n": - scrape_date = False - break - - else: - print("Geçersiz yanıt.") - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - print("\n") - - path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" - -def scrape(): - try: - driver = webdriver.Chrome(path) - time.sleep(delay) - - except: - print("Chromedriver 
kullanılamıyor.") - sys.exit() - - try: - driver.get("https://www.hepsiburada.com") - time.sleep(delay) - driver.maximize_window() - time.sleep(delay) - - except: - print("Hepsiburada'ya erişilemiyor.") - sys.exit() - - try: - arama_bari = driver.find_element_by_class_name("desktopOldAutosuggestTheme-input") - arama_bari.send_keys(urun_adi) - arama_bari.send_keys(Keys.ENTER) - time.sleep(delay) - urun = driver.find_element_by_class_name("search-item") - urun.click() - time.sleep(delay) - - except: - print("Ürün bulunamadı.") - sys.exit() - - try: - review_count = driver.find_element_by_id("productReviewsTab").text - review_count = review_count.replace("Değerlendirmeler ", "") - review_count = review_count.replace("(","") - review_count = review_count.replace(")","") - review_count = int(review_count) - if review_count % 30 == 0: - review_page_count = review_count // 10 - - else: - review_page_count = (review_count // 10) + 1 - - constant_url = driver.current_url - - except: - print("İnceleme bulunamadı.") - sys.exit() - - try: - index_of_question_mark = constant_url.index("?") - constant_url = constant_url[:index_of_question_mark] - - except: - pass - - i = 1 - while i <= review_page_count: - - url = constant_url + "-yorumlari?sayfa=" + str(i) - driver.get(url) - - print("Veriler çekiliyor...") - print("Sayfa: " + str(i)) - - reviews = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//span[@itemprop='description']") - for review in reviews: - review = review.text - review_texts.append(review) - - customer_names = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//span[@itemprop='author']") - for customer_name in customer_names: - customer_name = customer_name.text - customer_name_texts.append(customer_name) - - customer_ages = driver.find_elements_by_xpath("//*[@class='hermes-ReviewCard-module-1-Wp3']//span[2]") - for customer_age in customer_ages: - customer_age = customer_age.text - customer_age = customer_age.replace("(", "") - 
customer_age = customer_age.replace(")", "") - - if customer_age == "": - customer_age = "Boş" - - customer_age_texts.append(customer_age) - - customer_provinces = driver.find_elements_by_xpath("//*[@class='hermes-ReviewCard-module-1-Wp3']//span[3]") - for customer_province in customer_provinces: - customer_province = customer_province.text - customer_province = customer_province.replace("-", "") - customer_province = customer_province.replace(" ", "") - customer_province_texts.append(customer_province) - - dates = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//span[@itemprop='datePublished']") - for date in dates: - date = date.text - date = date.replace(",", "") - date = date.split() - - day_conv = { - "Pts":"Pazartesi", - "Sal":"Salı", - "Çar":"Çarşamba", - "Per":"Perşembe", - "Cum":"Cuma", - "Cts":"Cumartesi", - "Paz":"Pazar", - "Pazartesi":"Pazartesi", - "Salı":"Salı", - "Çarşamba":"Çarşamba", - "Perşembe":"Perşembe", - "Cuma":"Cuma", - "Cumartesi":"Cumartesi", - "Pazar":"Pazar" - } - - years = ["2019", "2018", "2017", "2016", "2015", "2014", "2013", "2012", "2011", "2010", "2009", "2008", "2007", "2006", "2005", "2004", "2003", "2002", "2001", "2000"] - - if date[2] not in years: - date.insert(2, "2020") - - date[-1] = day_conv[date[-1]] - date = " ".join(date) - date_texts.append(date) - - usefuls = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//button[@class='hermes-ReviewCard-module-1MoiF']") - not_usefuls = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//button[@class='hermes-ReviewCard-module-39K0Y']") - - for useful in usefuls: - useful = useful.text - useful = useful.replace("Evet", "") - useful = useful.replace("(", "") - useful = useful.replace(")", "") - review_useful.append(useful) +def hepsiburada_scrape(): + def initialize(): + def preference(scrape_input): + while (scrape_input.lower() != "y") or (scrape_input.lower() != "n"): + if scrape_input.lower() == "y": + output = True + break + + 
elif scrape_input.lower() == "n": + output = False + break + + else: + print("Geçersiz yanıt.") + scrape_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + + return output + + print(""" + --------------------------------------------------------- + - Hepsiburada Scraper'a hoş geldiniz! - + - Geliştirici: Arda Uzunoğlu - + --------------------------------------------------------- + """) + + global product_name, file, delay, review_texts, review_useful, review_not_useful, customer_name_texts, customer_province_texts, customer_age_texts, date_texts, scrape_useful, scrape_customer_name, scrape_customer_province, scrape_customer_age, scrape_date, path + + product_name = input("Değerlendirmelerin çekileceği ürün adı: ") + file = input("Oluşturulacak Excel dosyasının adı: ") + file = file + ".xlsx" + delay = int(input("Bekleme süresi: ")) + + review_texts = [] + review_useful = [] + review_not_useful = [] + customer_name_texts = [] + customer_province_texts = [] + customer_age_texts = [] + date_texts = [] + + scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + scrape_useful = preference(scrape_useful_input) + + scrape_customer_name_input = input("Müşteri isimleri çekilsin mi(y/n): ") + scrape_customer_name = preference(scrape_customer_name_input) + + scrape_customer_province_input = input("Müşteri konumları çekilsin mi(y/n): ") + scrape_customer_province = preference(scrape_customer_province_input) + + scrape_customer_age_input = input("Müşteri yaşları çekilsin mi(y/n): ") + scrape_customer_age = preference(scrape_customer_age_input) + + scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") + scrape_date = preference(scrape_date_input) - for not_useful in not_usefuls: - not_useful = not_useful.text - not_useful = not_useful.replace("Hayır", "") - not_useful = not_useful.replace("(", "") - not_useful = not_useful.replace(")", "") - review_not_useful.append(not_useful) - - while len(review_useful) < 
len(date_texts): - review_useful.append("0") - review_not_useful.append("0") - - while len(review_texts) < len(date_texts): - review_texts.append("Boş") - - i += 1 - - driver.close() - kisa = [len(review_texts), len(review_useful), len(review_not_useful), len(date_texts), len(customer_name_texts), len(customer_age_texts), len(customer_province_texts)] - kisa = min(kisa) - kisa -= 1 - - review_texts_fin = review_texts[:kisa] - df = pd.DataFrame({"Değerlendirme: ":review_texts_fin}) - - if scrape_useful: - review_useful_fin = review_useful[:kisa] - review_not_useful_fin = review_not_useful[:kisa] - df["Değerlendirmeyi Yararlı Bulanlar"] = review_useful_fin - df["Değerlendirmeyi Yararlı Bulmayanlar"] = review_not_useful_fin - - if scrape_date: - date_texts_fin = date_texts[:kisa] - df["Değerlendirme Tarihi:"] = date_texts_fin - - if scrape_customer_name: - customer_name_texts_fin = customer_name_texts[:kisa] - df["Müşterinin Adı Soyadı"] = customer_name_texts_fin - - if scrape_customer_age: - customer_age_texts_fin = customer_age_texts[:kisa] - df["Müşterinin Yaşı"] = customer_age_texts_fin - - if scrape_customer_province: - customer_province_texts_fin = customer_province_texts[:kisa] - df["Müşterinin Konumu"] = customer_province_texts_fin - - df.to_excel(dosya_adi, header = True, index = False) - - x = "Çektiğiniz veriler "+ dosya_adi + " adlı excel dosyasına kaydedildi." - print(x) - - print(""" - -------------------------------------------------------------------------- - - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. 
- - - Github Hesabım: ardauzunoglu - - -------------------------------------------------------------------------- - """) - - time.sleep(3) + path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" + + def scrape(): + try: + driver = webdriver.Chrome(path) + time.sleep(delay) + + except: + print("Chromedriver kullanılamıyor.") + sys.exit() + + try: + driver.get("https://www.hepsiburada.com") + time.sleep(delay) + driver.maximize_window() + time.sleep(delay) + + except: + print("Hepsiburada'ya erişilemiyor.") + sys.exit() + + try: + arama_bari = driver.find_element_by_class_name("desktopOldAutosuggestTheme-input") + arama_bari.send_keys(product_name) + arama_bari.send_keys(Keys.ENTER) + time.sleep(delay) + urun = driver.find_element_by_class_name("search-item") + urun.click() + time.sleep(delay) + + except: + print("Ürün bulunamadı.") + sys.exit() + + try: + review_count = driver.find_element_by_id("productReviewsTab").text + review_count = review_count.replace("Değerlendirmeler ", "") + review_count = review_count.replace("(","") + review_count = review_count.replace(")","") + review_count = int(review_count) + if review_count % 30 == 0: + review_page_count = review_count // 10 + + else: + review_page_count = (review_count // 10) + 1 + + constant_url = driver.current_url + + except: + print("İnceleme bulunamadı.") + sys.exit() + + try: + index_of_question_mark = constant_url.index("?") + constant_url = constant_url[:index_of_question_mark] + + except: + pass + + i = 1 + while i <= review_page_count: + + url = constant_url + "-yorumlari?sayfa=" + str(i) + driver.get(url) + + print("Veriler çekiliyor...") + print("Sayfa: " + str(i)) + + reviews = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//span[@itemprop='description']") + for review in reviews: + review = review.text + review_texts.append(review) + + customer_names = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//span[@itemprop='author']") + for customer_name in customer_names: + 
customer_name = customer_name.text + customer_name_texts.append(customer_name) + + customer_ages = driver.find_elements_by_xpath("//*[@class='hermes-ReviewCard-module-1-Wp3']//span[2]") + for customer_age in customer_ages: + customer_age = customer_age.text + customer_age = customer_age.replace("(", "") + customer_age = customer_age.replace(")", "") + + if customer_age == "": + customer_age = "Boş" + + customer_age_texts.append(customer_age) + + customer_provinces = driver.find_elements_by_xpath("//*[@class='hermes-ReviewCard-module-1-Wp3']//span[3]") + for customer_province in customer_provinces: + customer_province = customer_province.text + customer_province = customer_province.replace("-", "") + customer_province = customer_province.replace(" ", "") + customer_province_texts.append(customer_province) + + dates = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//span[@itemprop='datePublished']") + for date in dates: + date = date.text + date = date.replace(",", "") + date = date.split() + + day_conv = { + "Pts":"Pazartesi", + "Sal":"Salı", + "Çar":"Çarşamba", + "Per":"Perşembe", + "Cum":"Cuma", + "Cts":"Cumartesi", + "Paz":"Pazar", + "Pazartesi":"Pazartesi", + "Salı":"Salı", + "Çarşamba":"Çarşamba", + "Perşembe":"Perşembe", + "Cuma":"Cuma", + "Cumartesi":"Cumartesi", + "Pazar":"Pazar" + } + + years = ["2019", "2018", "2017", "2016", "2015", "2014", "2013", "2012", "2011", "2010", "2009", "2008", "2007", "2006", "2005", "2004", "2003", "2002", "2001", "2000"] + + if date[2] not in years: + date.insert(2, "2020") + + date[-1] = day_conv[date[-1]] + date = " ".join(date) + date_texts.append(date) + + usefuls = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//button[@class='hermes-ReviewCard-module-1MoiF']") + not_usefuls = driver.find_elements_by_xpath("//*[@id='hermes-voltran-comments']//button[@class='hermes-ReviewCard-module-39K0Y']") + + for useful in usefuls: + useful = useful.text + useful = useful.replace("Evet", "") + 
useful = useful.replace("(", "") + useful = useful.replace(")", "") + review_useful.append(useful) + + for not_useful in not_usefuls: + not_useful = not_useful.text + not_useful = not_useful.replace("Hayır", "") + not_useful = not_useful.replace("(", "") + not_useful = not_useful.replace(")", "") + review_not_useful.append(not_useful) + + while len(review_useful) < len(date_texts): + review_useful.append("0") + review_not_useful.append("0") + + while len(review_texts) < len(date_texts): + review_texts.append("Boş") + + i += 1 + + driver.close() + + length_list = [review_texts, review_useful, review_not_useful, date_texts, customer_name_texts, customer_age_texts, customer_province_texts] + limit = map(len, length_list) + limit = min(list(limit)) + limit -= 1 + + review_texts_fin = review_texts[:limit] + df = pd.DataFrame({"Değerlendirme: ":review_texts_fin}) + + if scrape_useful: + review_useful_fin = review_useful[:limit] + review_not_useful_fin = review_not_useful[:limit] + df["Değerlendirmeyi Yararlı Bulanlar"] = review_useful_fin + df["Değerlendirmeyi Yararlı Bulmayanlar"] = review_not_useful_fin + + if scrape_date: + date_texts_fin = date_texts[:limit] + df["Değerlendirme Tarihi:"] = date_texts_fin + + if scrape_customer_name: + customer_name_texts_fin = customer_name_texts[:limit] + df["Müşterinin Adı Soyadı"] = customer_name_texts_fin + + if scrape_customer_age: + customer_age_texts_fin = customer_age_texts[:limit] + df["Müşterinin Yaşı"] = customer_age_texts_fin + + if scrape_customer_province: + customer_province_texts_fin = customer_province_texts[:limit] + df["Müşterinin Konumu"] = customer_province_texts_fin + + df.to_excel(file, header = True, index = False) + + x = "Çektiğiniz veriler "+ file + " adlı excel dosyasına kaydedildi." + print(x) + + print(""" + -------------------------------------------------------------------------- + - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. 
- + - Github Hesabım: ardauzunoglu - + -------------------------------------------------------------------------- + """) + + time.sleep(3) -if __name__ == "__main__": initialize() - scrape() \ No newline at end of file + scrape() + +if __name__ == "__main__": + hepsiburada_scrape() \ No newline at end of file diff --git a/scrapers/ismail.py b/scrapers/ismail.py new file mode 100644 index 0000000..e93712e --- /dev/null +++ b/scrapers/ismail.py @@ -0,0 +1,10 @@ +def degerkontrol(): + girdi = input("Değer: ") + while not girdi.isdigit(): + print("tekrar") + girdi = input("Değer: ") + + else: + return "bu bir sayi" + +print(degerkontrol()) \ No newline at end of file diff --git a/scrapers/kitapyurdu_scraper.py b/scrapers/kitapyurdu_scraper.py index ac62eba..2e1c62c 100644 --- a/scrapers/kitapyurdu_scraper.py +++ b/scrapers/kitapyurdu_scraper.py @@ -4,230 +4,213 @@ from selenium import webdriver from selenium.webdriver.common.keys import Keys -def initialize(): - - print(""" - --------------------------------------------------------- - - Kitapyurdu Scraper'a hoş geldiniz! 
- - - Geliştirici: Arda Uzunoğlu - - --------------------------------------------------------- - """) - - global kitap, dosya_adi, delay, review_texts, review_useful, review_not_useful, author_texts, date_texts, scrape_useful, scrape_author, scrape_date, path - - kitap = input("İncelemelerin Çekileceği Kitap Adı: ") - dosya_adi = input("Oluşturulacak Excel dosyasının adı: ") - dosya_adi = dosya_adi + ".xlsx" - delay = int(input("Bekleme süresi: ")) - - review_texts = [] - review_useful = [] - review_not_useful = [] - author_texts = [] - date_texts = [] - - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - while (scrape_useful_input.lower() != "y") or (scrape_useful_input.lower() != "n"): - if scrape_useful_input.lower() == "y": - scrape_useful = True - break - - elif scrape_useful_input.lower() == "n": - scrape_useful = False - break - - else: - print("Geçersiz yanıt.") - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - print("\n") - - scrape_author_input = input("Kullanıcı isimleri çekilsin mi(y/n): ") - while (scrape_author_input.lower() != "y") or (scrape_author_input.lower() != "n"): - if scrape_author_input.lower() == "y": - scrape_author = True - break - - elif scrape_author_input.lower() == "n": - scrape_author = False - break - - else: - print("Geçersiz yanıt.") - scrape_author_input = input("Kullanıcı isimleri çekilsin mi(y/n): ") - print("\n") - - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - while (scrape_date_input.lower() != "y") or (scrape_date_input.lower() != "n"): - if scrape_date_input.lower() == "y": - scrape_date = True - break - - elif scrape_date_input.lower() == "n": - scrape_date = False - break - - else: - print("Geçersiz yanıt.") - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - print("\n") - - path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" - -def scrape(): - try: - driver = webdriver.Chrome(path) - time.sleep(delay) - - 
except: - print("Chromedriver kullanılamıyor.") - sys.exit() - - try: - driver.get("https://www.kitapyurdu.com") - driver.maximize_window() - time.sleep(delay) - - except: - print("Kitapyurdu'na erişilemiyor.") - sys.exit() - - try: - search = driver.find_element_by_id("search-input") - search.send_keys(kitap) - search.send_keys(Keys.ENTER) - - time.sleep(delay) - +def kitapyurdu_scrape(): + def initialize(): + def preference(scrape_input): + while (scrape_input.lower() != "y") or (scrape_input.lower() != "n"): + if scrape_input.lower() == "y": + output = True + break + + elif scrape_input.lower() == "n": + output = False + break + + else: + print("Geçersiz yanıt.") + scrape_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + + return output + print(""" + --------------------------------------------------------- + - Kitapyurdu Scraper'a hoş geldiniz! - + - Geliştirici: Arda Uzunoğlu - + --------------------------------------------------------- + """) + + global book, file, delay, review_texts, review_useful, review_not_useful, author_texts, date_texts, scrape_useful, scrape_author, scrape_date, path + + book = input("İncelemelerin Çekileceği Kitap Adı: ") + file = input("Oluşturulacak Excel dosyasının adı: ") + file = file + ".xlsx" + delay = int(input("Bekleme süresi: ")) + + review_texts = [] + review_useful = [] + review_not_useful = [] + author_texts = [] + date_texts = [] + + scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + scrape_useful = preference(scrape_useful_input) + + scrape_author_input = input("Kullanıcı isimleri çekilsin mi(y/n): ") + scrape_author = preference(scrape_author_input) + + scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") + scrape_date = preference(scrape_date_input) + + path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" + + def scrape(): try: - close_notification = driver.find_element_by_class_name("opt-in-disallow-button") - close_notification.click() + driver = 
webdriver.Chrome(path) + time.sleep(delay) except: - pass - - time.sleep(delay) - - product = driver.find_element_by_class_name("name.ellipsis") - product.click() - time.sleep(delay) - - except: - print("Kitap bulunamadı.") - sys.exit() - - try: - reviewsTab = driver.find_element_by_class_name("pr__htabs-review-text") - reviewsTab.click() - time.sleep(delay) - - except: - print("Kitap incelemeleri bulunamadı.") - sys.exit() + print("Chromedriver kullanılamıyor.") + sys.exit() - l = 1 - review_length = reviewsTab.text.replace("Yorumlar","") - - try: - review_length = review_length.replace(".","") - review_length = int(review_length) - - except: - review_length = int(review_length) - - if review_length % 5 == 0: - review_length = review_length // 5 - else: - review_length = (review_length // 5) + 1 - - while l <= review_length: - - lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") - match = False - - while match == False: - lastCount = lenOfPage + try: + driver.get("https://www.kitapyurdu.com") + driver.maximize_window() time.sleep(delay) - lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") - if lastCount == lenOfPage: - match = True - print("Veriler çekiliyor...") - print("Sayfa: " + str(l)) + except: + print("Kitapyurdu'na erişilemiyor.") + sys.exit() try: - reviews = driver.find_elements_by_class_name("review-text") - for review in reviews: - review = review.text - review_texts.append(review) - - authors = driver.find_elements_by_xpath("//a[@class ='alt']//span[@itemprop='name']") - for author in authors: - author = author.text - author_texts.append(author) - - dates = driver.find_elements_by_class_name("review-date") - for date in dates: - date = date.text - date_texts.append(date) - - usefuls = driver.find_elements_by_xpath("//div[@class ='agree']//span[@class='count']") - for 
useful in usefuls: - useful = useful.text - review_useful.append(useful) - - not_usefuls = driver.find_elements_by_xpath("//div[@class ='disagree']//span[@class='count']") - for not_useful in not_usefuls: - not_useful = not_useful.text - review_not_useful.append(not_useful) + search = driver.find_element_by_id("search-input") + search.send_keys(book) + search.send_keys(Keys.ENTER) - except: time.sleep(delay) - l += 1 + try: + close_notification = driver.find_element_by_class_name("opt-in-disallow-button") + close_notification.click() - try: - next_page = driver.find_element_by_link_text(str(l)) - next_page.click() + except: + pass - except: time.sleep(delay) - kisa = [len(review_texts), len(author_texts), len(date_texts), len(review_useful), len(review_not_useful)] - kisa = min(kisa) - kisa -= 1 - - review_texts_fin = review_texts[:kisa] - - df = pd.DataFrame({"Yorumlar": review_texts_fin}) + product = driver.find_element_by_class_name("name.ellipsis") + product.click() + time.sleep(delay) - if scrape_author: - author_texts_fin = author_texts[:kisa] - df["Müşteriler"] = author_texts_fin + except: + print("Kitap bulunamadı.") + sys.exit() - if scrape_date: - date_texts_fin = date_texts[:kisa] - df["İnceleme Tarihi"] = date_texts_fin + try: + reviewsTab = driver.find_element_by_class_name("pr__htabs-review-text") + reviewsTab.click() + time.sleep(delay) - if scrape_useful: - review_useful_fin = review_useful[:kisa] - review_not_useful_fin = review_not_useful[:kisa] - df["İncelemeyi Yararlı Bulan Kişi Sayısı"] = review_useful_fin - df["İncelemeyi Yararlı Bulmayan Kişi Sayısı"] = review_not_useful_fin + except: + print("Kitap incelemeleri bulunamadı.") + sys.exit() - df.to_excel(dosya_adi, header = True, index = False) + l = 1 + review_length = reviewsTab.text.replace("Yorumlar","") + + try: + review_length = review_length.replace(".","") + review_length = int(review_length) - x = "Çektiğiniz veriler "+ dosya_adi + " adlı excel dosyasına kaydedildi." 
- print(x) + except: + review_length = int(review_length) - print(""" - -------------------------------------------------------------------------- - - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. - - - Github Hesabım: ardauzunoglu - - -------------------------------------------------------------------------- - """) + if review_length % 5 == 0: + review_length = review_length // 5 + else: + review_length = (review_length // 5) + 1 - time.sleep(3) + while l <= review_length: -if __name__ == "__main__": + lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") + match = False + + while match == False: + lastCount = lenOfPage + time.sleep(delay) + lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") + if lastCount == lenOfPage: + match = True + + print("Veriler çekiliyor...") + print("Sayfa: " + str(l)) + + try: + reviews = driver.find_elements_by_class_name("review-text") + for review in reviews: + review = review.text + review_texts.append(review) + + authors = driver.find_elements_by_xpath("//a[@class ='alt']//span[@itemprop='name']") + for author in authors: + author = author.text + author_texts.append(author) + + dates = driver.find_elements_by_class_name("review-date") + for date in dates: + date = date.text + date_texts.append(date) + + usefuls = driver.find_elements_by_xpath("//div[@class ='agree']//span[@class='count']") + for useful in usefuls: + useful = useful.text + review_useful.append(useful) + + not_usefuls = driver.find_elements_by_xpath("//div[@class ='disagree']//span[@class='count']") + for not_useful in not_usefuls: + not_useful = not_useful.text + review_not_useful.append(not_useful) + + except: + time.sleep(delay) + + l += 1 + + try: + next_page = driver.find_element_by_link_text(str(l)) + next_page.click() + + except: + time.sleep(delay) + + 
driver.close() + + length_list = [review_texts, review_useful, review_not_useful, author_texts, date_texts] + limit = map(len, length_list) + limit = min(list(limit)) + limit -= 1 + + review_texts_fin = review_texts[:limit] + + df = pd.DataFrame({"Yorumlar": review_texts_fin}) + + if scrape_author: + author_texts_fin = author_texts[:limit] + df["Müşteriler"] = author_texts_fin + + if scrape_date: + date_texts_fin = date_texts[:limit] + df["İnceleme Tarihi"] = date_texts_fin + + if scrape_useful: + review_useful_fin = review_useful[:limit] + review_not_useful_fin = review_not_useful[:limit] + df["İncelemeyi Yararlı Bulan Kişi Sayısı"] = review_useful_fin + df["İncelemeyi Yararlı Bulmayan Kişi Sayısı"] = review_not_useful_fin + + df.to_excel(file, header = True, index = False) + + x = "Çektiğiniz veriler "+ file + " adlı excel dosyasına kaydedildi." + print(x) + + print(""" + -------------------------------------------------------------------------- + - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. 
- + - Github Hesabım: ardauzunoglu - + -------------------------------------------------------------------------- + """) + + time.sleep(3) initialize() scrape() + +if __name__ == "__main__": + kitapyurdu_scrape() \ No newline at end of file diff --git a/scrapers/main-scraper.py b/scrapers/main-scraper.py index 740119c..718f5c8 100644 --- a/scrapers/main-scraper.py +++ b/scrapers/main-scraper.py @@ -1,23 +1,27 @@ import beyazperde_scraper as bp +from beyazperde_scraper import beyazperde_scrape import eksi_scraper as es +from eksi_scraper import eksisozluk_scrape import hepsiburada_scraper as hb +from hepsiburada_scraper import hepsiburada_scrape import kitapyurdu_scraper as ky +from kitapyurdu_scraper import kitapyurdu_scrape import trendyol_scraper as ty +from trendyol_scraper import trendyol_scrape import yemeksepeti_scraper as ys +from yemeksepeti_scraper import yemeksepeti_scrape import youtube_scraper as yt +from youtube_scraper import youtube_scrape -secenekler = {"beyazperde":bp, "ekşi sözlük":es, "hepsiburada":hb, "kitapyurdu":ky, "trendyol":ty, "yemeksepeti":ys, "youtube":yt} +choices = ["beyazperde", "ekşi sözlük", "hepsiburada", "kitapyurdu", "trendyol", "yemeksepeti", "youtube"] +libs = {"beyazperde":beyazperde_scrape, "ekşi sözlük":eksisozluk_scrape, "hepsiburada":hepsiburada_scrape, "kitapyurdu":kitapyurdu_scrape, "trendyol":trendyol_scrape, "yemeksepeti":yemeksepeti_scrape, "youtube":youtube_scrape} -secenek = input("Kullanacağınız scraper: ") -secenek = secenek.lower() - -if secenek in secenekler: - - secenek = secenekler[secenek] - secenek.initialize() - secenek.scrape() +choice = input("Kullanacağınız scraper: ") +choice = choice.lower() +if choice in choices: + scraper = libs[choice] + scraper() else: print("Geçersiz yanıt.") - secenek = input("Kullanacağınız scraper: ") - print("\n") \ No newline at end of file + secenek = input("Kullanacağınız scraper: ") \ No newline at end of file diff --git a/scrapers/trendyol_scraper.py 
b/scrapers/trendyol_scraper.py index bbe575c..b2644d7 100644 --- a/scrapers/trendyol_scraper.py +++ b/scrapers/trendyol_scraper.py @@ -4,202 +4,179 @@ from selenium import webdriver from selenium.webdriver.common.keys import Keys -def initialize(): - - print(""" - --------------------------------------------------------- - - Trendyol Scraper'a hoş geldiniz! - - - Geliştirici: Arda Uzunoğlu - - --------------------------------------------------------- - """) - - - global urun_adi, dosya_adi, delay, review_texts, review_useful, customer_name_texts, date_texts, scrape_useful, scrape_customer_name, scrape_date, path - - urun_adi = input("Yorumların çekileceği ürün adı: ") - dosya_adi = input("Oluşturulacak Excel dosyasının adı: ") - dosya_adi = dosya_adi + ".xlsx" - delay = int(input("Bekleme süresi: ")) - - review_texts = [] - review_useful = [] - customer_name_texts = [] - date_texts = [] - - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - while (scrape_useful_input.lower() != "y") or (scrape_useful_input.lower() != "n"): - if scrape_useful_input.lower() == "y": - scrape_useful = True - break - - elif scrape_useful_input.lower() == "n": - scrape_useful = False - break - - else: - print("Geçersiz yanıt.") - scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") - print("\n") - - scrape_customer_name_input = input("Müşteri isimleri çekilsin mi(y/n): ") - while (scrape_customer_name_input.lower() != "y") or (scrape_customer_name_input.lower() != "n"): - if scrape_customer_name_input.lower() == "y": - scrape_customer_name = True - break - - elif scrape_customer_name_input.lower() == "n": - scrape_customer_name = False - break - - else: - print("Geçersiz yanıt.") - scrape_customer_name_input = input("Müşteri isimleri çekilsin mi(y/n): ") - print("\n") - - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - while (scrape_date_input.lower() != "y") or (scrape_date_input.lower() != "n"): - if 
scrape_date_input.lower() == "y": - scrape_date = True - break - - elif scrape_date_input.lower() == "n": - scrape_date = False - break - - else: - print("Geçersiz yanıt.") - scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") - print("\n") - - scrape_useful = True - scrape_customer_name = True - scrape_date = True - - path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" - -def scrape(): - try: - driver = webdriver.Chrome(path) - time.sleep(1) - - except: - print("Chromedriver kullanılamıyor.") - sys.exit() - - try: - driver.get("https://www.trendyol.com") - time.sleep(1) - driver.maximize_window() - time.sleep(1) - - except: - print("Trendyola'a erişilemiyor.") - sys.exit() - - try: - arama_bari = driver.find_element_by_class_name("search-box") - arama_bari.send_keys(urun_adi) - arama_bari.send_keys(Keys.ENTER) - time.sleep(1) - - urun = driver.find_element_by_class_name("prdct-desc-cntnr") - urun.click() - time.sleep(1) - - except: - print("Ürün bulunamadı.") - sys.exit() - - url = driver.current_url - index_of_question_mark = url.index("?") - url = url[:index_of_question_mark] - url = url + "/yorumlar" - driver.get(url) - - yorum_sayisi = driver.find_element_by_class_name("pr-rnr-sm-p-s") - yorum_sayisi = yorum_sayisi.text - yorum_sayisi = yorum_sayisi.replace("Değerlendirme", "") - yorum_sayisi = yorum_sayisi.replace("Yorum", "") - yorum_sayisi = yorum_sayisi.split() - yorum_sayisi = int(yorum_sayisi[1]) - - while len(review_texts) < yorum_sayisi: - - lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") - match = False - - while match == False: - lastCount = lenOfPage +def trendyol_scrape(): + def initialize(): + def preference(scrape_input): + while (scrape_input.lower() != "y") or (scrape_input.lower() != "n"): + if scrape_input.lower() == "y": + output = True + break + + elif scrape_input.lower() == "n": + output = False + break + + else: + print("Geçersiz 
yanıt.") + scrape_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + + return output + print(""" + --------------------------------------------------------- + - Trendyol Scraper'a hoş geldiniz! - + - Geliştirici: Arda Uzunoğlu - + --------------------------------------------------------- + """) + + + global product_name, file, delay, review_texts, review_useful, customer_name_texts, date_texts, scrape_useful, scrape_customer_name, scrape_date, path + + product_name = input("Yorumların çekileceği ürün adı: ") + file = input("Oluşturulacak Excel dosyasının adı: ") + file = file + ".xlsx" + delay = int(input("Bekleme süresi: ")) + + review_texts = [] + review_useful = [] + customer_name_texts = [] + date_texts = [] + + scrape_useful_input = input("İncelemenin aldığı beğeni sayısı çekilsin mi(y/n): ") + scrape_useful = preference(scrape_useful_input) + + scrape_customer_name_input = input("Müşteri isimleri çekilsin mi(y/n): ") + scrape_customer_name = preference(scrape_customer_name_input) + + scrape_date_input = input("İnceleme tarihleri çekilsin mi(y/n): ") + scrape_date = preference(scrape_date_input) + + path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ" + + def scrape(): + try: + driver = webdriver.Chrome(path) + time.sleep(1) + + except: + print("Chromedriver kullanılamıyor.") + sys.exit() + + try: + driver.get("https://www.trendyol.com") time.sleep(1) + driver.maximize_window() + time.sleep(1) + + except: + print("Trendyola'a erişilemiyor.") + sys.exit() + + try: + arama_bari = driver.find_element_by_class_name("search-box") + arama_bari.send_keys(product_name) + arama_bari.send_keys(Keys.ENTER) + time.sleep(1) + + urun = driver.find_element_by_class_name("prdct-desc-cntnr") + urun.click() + time.sleep(1) + + except: + print("Ürün bulunamadı.") + sys.exit() + + url = driver.current_url + index_of_question_mark = url.index("?") + url = url[:index_of_question_mark] + url = url + "/yorumlar" + driver.get(url) + + yorum_sayisi = 
driver.find_element_by_class_name("pr-rnr-sm-p-s") + yorum_sayisi = yorum_sayisi.text + yorum_sayisi = yorum_sayisi.replace("Değerlendirme", "") + yorum_sayisi = yorum_sayisi.replace("Yorum", "") + yorum_sayisi = yorum_sayisi.split() + yorum_sayisi = int(yorum_sayisi[1]) + + while len(review_texts) < yorum_sayisi: + lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") - if lastCount == lenOfPage: - match = True + match = False - time.sleep(1) + while match == False: + lastCount = lenOfPage + time.sleep(1) + lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") + if lastCount == lenOfPage: + match = True - reviews = driver.find_elements_by_class_name("rnr-com-tx") - for review in reviews: - review = review.text - review_texts.append(review) + time.sleep(1) - print("Veriler çekiliyor...") - print("İnceleme: " + str(len(review_texts))) + reviews = driver.find_elements_by_class_name("rnr-com-tx") + for review in reviews: + review = review.text + review_texts.append(review) - usefuls = driver.find_elements_by_xpath("//*[@class='tooltip-wrp']//span[2]") - for useful in usefuls: - useful = useful.text - useful = useful.strip("()") - review_useful.append(useful) + print("Veriler çekiliyor...") + print("İnceleme: " + str(len(review_texts))) - customers = driver.find_elements_by_xpath("//*[@class='rnr-com-bt']//span[@class = 'rnr-com-usr']") - for customer in customers: - customer = customer.text - customer = customer.replace("|","") - customer = customer.split() + usefuls = driver.find_elements_by_xpath("//*[@class='tooltip-wrp']//span[2]") + for useful in usefuls: + useful = useful.text + useful = useful.strip("()") + review_useful.append(useful) - customer_name = customer[-3:] - customer_name = " ".join(customer_name) - customer_name_texts.append(customer_name) + customers = 
driver.find_elements_by_xpath("//*[@class='rnr-com-bt']//span[@class = 'rnr-com-usr']") + for customer in customers: + customer = customer.text + customer = customer.replace("|","") + customer = customer.split() - date = customer[:-3] - date = " ".join(date) - date_texts.append(date) + customer_name = customer[-3:] + customer_name = " ".join(customer_name) + customer_name_texts.append(customer_name) - driver.close() + date = customer[:-3] + date = " ".join(date) + date_texts.append(date) - kisa = [len(review_texts), len(customer_name_texts), len(date_texts), len(review_useful)] - kisa = min(kisa) - kisa -= 1 + driver.close() - review_texts_fin = review_texts[:kisa] - df = pd.DataFrame({"Yorum": review_texts_fin}) + length_list = [review_texts, review_useful, customer_name_texts, date_texts] + limit = map(len, length_list) + limit = min(list(limit)) + limit -= 1 - if scrape_useful: - review_useful_fin = review_useful[:kisa] - df["Yorum Beğeni Sayısı"] = review_useful_fin + review_texts_fin = review_texts[:limit] + df = pd.DataFrame({"Yorum": review_texts_fin}) - if scrape_customer_name: - customer_name_texts_fin = customer_name_texts[:kisa] - df["Yorum Yazan Müşteri"] = customer_name_texts_fin + if scrape_useful: + review_useful_fin = review_useful[:limit] + df["Yorum Beğeni Sayısı"] = review_useful_fin - if scrape_date: - date_texts_fin = date_texts[:kisa] - df["Yorumun Yazıldığı Tarih"] = date_texts_fin + if scrape_customer_name: + customer_name_texts_fin = customer_name_texts[:limit] + df["Yorum Yazan Müşteri"] = customer_name_texts_fin - df.to_excel(dosya_adi, header = True, index = False) + if scrape_date: + date_texts_fin = date_texts[:limit] + df["Yorumun Yazıldığı Tarih"] = date_texts_fin - x = "Çektiğiniz veriler "+ dosya_adi + " adlı excel dosyasına kaydedildi." 
- print(x) + df.to_excel(file, header = True, index = False) - print(""" - -------------------------------------------------------------------------- - - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. - - - Github Hesabım: ardauzunoglu - - -------------------------------------------------------------------------- - """) + x = "Çektiğiniz veriler "+ file + " adlı excel dosyasına kaydedildi." + print(x) - time.sleep(3) + print(""" + -------------------------------------------------------------------------- + - Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın. - + - Github Hesabım: ardauzunoglu - + -------------------------------------------------------------------------- + """) -if __name__ == "__main__": + time.sleep(3) initialize() - scrape() \ No newline at end of file + scrape() + +if __name__ == "__main__": + trendyol_scrape() \ No newline at end of file diff --git a/scrapers/yemeksepeti_scraper.py b/scrapers/yemeksepeti_scraper.py index c7eabd2..28aef50 100644 --- a/scrapers/yemeksepeti_scraper.py +++ b/scrapers/yemeksepeti_scraper.py @@ -4,332 +4,289 @@ from selenium import webdriver from selenium.webdriver.common.keys import Keys -def initialize(): - - print(""" - --------------------------------------------------------- - - Yemeksepeti Scraper'a hoş geldiniz! 
def yemeksepeti_scrape():
    """Interactively scrape one restaurant's Yemeksepeti reviews into an Excel file.

    Prompts on stdin for the restaurant, credentials, city, output file name,
    delay and the optional columns, drives Chrome through Selenium to collect
    the reviews, and writes the result with pandas.  Calls ``sys.exit()`` when
    a navigation step fails.
    """

    def initialize():
        """Read the run settings from stdin and publish them as module globals."""

        def preference(prompt):
            """Ask a y/n question until it gets a valid answer; True means "y"."""
            while True:
                answer = input(prompt).lower()
                if answer == "y":
                    return True
                if answer == "n":
                    return False
                # Re-ask the *same* question instead of a hard-coded one.
                print("Geçersiz yanıt.")

        print("""
            ---------------------------------------------------------
            -          Yemeksepeti Scraper'a hoş geldiniz!          -
            -              Geliştirici: Arda Uzunoğlu               -
            ---------------------------------------------------------
        """)

        global restaurant_info, username_info, password_info, city_info, file, delay, review_texts, author_texts, date_texts, speed_ratings, service_ratings, flavour_ratings, scrape_author, scrape_date, scrape_speed, scrape_service, scrape_flavour, path

        restaurant_info = input("Yorumların Çekileceği Restoran: ")
        username_info = input("Yemeksepeti kullanıcı adı: ")
        # The typed password is used as-is; it must never be replaced by a
        # credential embedded in the source.
        password_info = input("Yemeksepeti parola: ")
        city_info = input("Yemeksepeti Şehir: ")
        file = input("Oluşturulacak Excel dosyasının adı: ")
        file = file + ".xlsx"
        delay = int(input("Bekleme süresi: "))

        review_texts = []
        author_texts = []
        date_texts = []
        speed_ratings = []
        service_ratings = []
        flavour_ratings = []

        scrape_author = preference("Müşteri isimleri çekilsin mi(y/n): ")
        scrape_date = preference("İnceleme tarihleri çekilsin mi(y/n): ")
        scrape_speed = preference("İncelemedeki hız puanı çekilsin mi(y/n): ")
        scrape_service = preference("İncelemedeki servis puanı çekilsin mi(y/n): ")
        scrape_flavour = preference("İncelemedeki lezzet puanı çekilsin mi(y/n): ")

        path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ"

        # Map Turkish letters to ASCII *before* lower-casing: "İ".lower()
        # yields "i" followed by a combining dot, which would corrupt the URL.
        tr2eng = str.maketrans({
            "ğ": "g", "Ğ": "g",
            "ş": "s", "Ş": "s",
            "ı": "i", "I": "i", "İ": "i",
            "ü": "u", "Ü": "u",
            "ö": "o", "Ö": "o",
            "ç": "c", "Ç": "c",
        })
        city_info = city_info.translate(tr2eng).lower()

    def scrape():
        """Log in, open the restaurant's review tab, collect every page, export."""
        try:
            driver = webdriver.Chrome(path)
            time.sleep(delay)

        except Exception:
            print("Chromedriver kullanılamıyor.")
            sys.exit()

        # From here on the browser exists, so make sure it is shut down on
        # every exit path, including the sys.exit() calls below.
        try:
            try:
                url = "https://www.yemeksepeti.com/" + city_info
                driver.get(url)
                driver.maximize_window()
                time.sleep(delay)

            except Exception:
                print("Yemeksepeti'ne ulaşılamıyor.")
                sys.exit()

            try:
                username = driver.find_element_by_id("UserName")
                username.send_keys(username_info)

                password = driver.find_element_by_id("password")
                password.send_keys(password_info)
                password.send_keys(Keys.ENTER)
                time.sleep(delay)

            except Exception:
                print("Kullanıcı adı ve/veya parola hatalı.")
                sys.exit()

            try:
                address_area = driver.find_element_by_class_name("address-area")
                address_area.click()
                time.sleep(delay)

            except Exception:
                print("Kayıtlı adres bulunamadı.")
                sys.exit()

            try:
                search_box = driver.find_element_by_class_name("search-box")
                search_box.send_keys(restaurant_info)
                search_box.send_keys(Keys.ENTER)
                time.sleep(delay+3)

                restoran = driver.find_element_by_class_name("restaurantName")
                restoran.click()
                time.sleep(delay)

            except Exception:
                print("Restoran bulunamadı.")
                sys.exit()

            try:
                yorumlar_section = driver.find_element_by_xpath("//*[@id='restaurantDetail']/div[2]/div[1]/ul/li[4]/a")
                yorumlar_section.click()
                time.sleep(delay)

            except Exception:
                print("Yorum bulunamadı.")
                sys.exit()

            # The tab label is read as "Yorumlar(<count>)"; the paging below
            # assumes 30 reviews per page.
            review_count = yorumlar_section.text
            review_count = review_count.replace("Yorumlar", "")
            review_count = review_count.replace("(", "").replace(")", "")
            review_count = int(review_count)
            page_count = -(-review_count // 30)  # ceiling division

            scroll_js = "window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;"

            # Visit pages 1..page_count inclusive so the last page is scraped too.
            for page in range(1, page_count + 1):
                # Keep scrolling until the page height stops growing.
                height = driver.execute_script(scroll_js)
                while True:
                    time.sleep(delay)
                    new_height = driver.execute_script(scroll_js)
                    if new_height == height:
                        break
                    height = new_height

                print("Veriler çekiliyor...")
                print("Sayfa: " + str(page))

                for yorum in driver.find_elements_by_class_name("comment.row"):
                    words = yorum.text.replace("\n", " ").split()
                    # Drop a leading token containing "." (guarded so an
                    # empty review cannot raise IndexError).
                    if words and "." in words[0]:
                        words = words[1:]
                    review_texts.append(" ".join(words))

                for author in driver.find_elements_by_class_name("userName"):
                    author_texts.append(author.text)

                for date in driver.find_elements_by_class_name("commentDate"):
                    date_texts.append(date.text)

                for speed in driver.find_elements_by_class_name("speed"):
                    speed_ratings.append(speed.text.replace("Hız: ", ""))

                for service in driver.find_elements_by_class_name("serving"):
                    service_ratings.append(service.text.replace("Servis: ", ""))

                for flavour in driver.find_elements_by_class_name("flavour"):
                    flavour_ratings.append(flavour.text.replace("Lezzet: ", ""))

                # Only navigate when there is a following page to open.
                if page < page_count:
                    next_page = driver.find_element_by_link_text(str(page + 1))
                    next_page.click()

        finally:
            driver.quit()

        # Entries authored by these names get a placeholder inserted into the
        # date/rating columns so rows stay aligned with author_texts.  Both
        # names are handled in one ascending pass, so a page containing both
        # kinds is aligned correctly.
        placeholder_authors = ("Restoran Cevabı", "Yemeksepeti")
        for index, author in enumerate(author_texts):
            if author in placeholder_authors:
                date_texts.insert(index, author)
                speed_ratings.insert(index, author)
                service_ratings.insert(index, author)
                flavour_ratings.insert(index, author)

        # Truncate every column to the shortest one; no extra "- 1", which
        # used to discard the last complete row.
        columns = [review_texts, author_texts, date_texts, speed_ratings, service_ratings, flavour_ratings]
        limit = min(map(len, columns))

        # Use a local name here: rebinding review_texts inside this function
        # would make it local and break the appends above.
        df = pd.DataFrame({"Yorumlar": review_texts[:limit]})

        if scrape_author:
            df["Müşteriler"] = author_texts[:limit]

        if scrape_date:
            df["Yorum Tarihi"] = date_texts[:limit]

        if scrape_speed:
            df["Hız Değerlendirmesi"] = speed_ratings[:limit]

        if scrape_service:
            df["Servis Değerlendirmesi"] = service_ratings[:limit]

        if scrape_flavour:
            df["Lezzet Değerlendirmesi"] = flavour_ratings[:limit]

        df.to_excel(file, header = True, index = False)

        x = "Çektiğiniz veriler "+ file + " adlı excel dosyasına kaydedildi."
        print(x)

        print("""
            --------------------------------------------------------------------------
            -  Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın.  -
            -  Github Hesabım: ardauzunoglu                                          -
            --------------------------------------------------------------------------
        """)

        time.sleep(3)

    initialize()
    scrape()


if __name__ == "__main__":
    yemeksepeti_scrape()
def youtube_scrape():
    """Interactively scrape the comments of one YouTube video into an Excel file.

    Prompts on stdin for the video URL, output file name, delay and the
    optional columns, drives Chrome through Selenium to read the comment
    threads, and writes the result with pandas.  Calls ``sys.exit()`` when
    the browser or the page cannot be opened.
    """

    def initialize():
        """Read the run settings from stdin and publish them as module globals."""

        def preference(prompt):
            """Ask a y/n question until it gets a valid answer; True means "y"."""
            while True:
                answer = input(prompt).lower()
                if answer == "y":
                    return True
                if answer == "n":
                    return False
                # Re-ask the *same* question instead of a hard-coded one.
                print("Geçersiz yanıt.")

        print("""
            ---------------------------------------------------------
            -            Youtube Scraper'a hoş geldiniz!            -
            -              Geliştirici: Arda Uzunoğlu               -
            ---------------------------------------------------------
        """)

        global url, file, delay, scrape_author, scrape_date, scrape_title, scrape_like, path

        url = input("Yorumların çekileceği Youtube videosunun bağlantısı: ")
        file = input("Oluşturulacak Excel dosyasının adı: ")
        file = file + ".xlsx"
        delay = int(input("Bekleme süresi: "))

        scrape_author = preference("Kullanıcı isimleri çekilsin mi(y/n): ")
        scrape_date = preference("Yorum tarihleri çekilsin mi(y/n): ")
        scrape_title = preference("Video başlığı çekilsin mi(y/n): ")
        scrape_like = preference("Yorumun aldığı beğeni sayısı çekilsin mi(y/n): ")

        path = "BURAYA CHROMEDRIVER KONUMUNU GİRİNİZ"

    def scrape():
        """Open the video, load its comments by scrolling, collect and export them."""
        comment_texts = []
        author_texts = []
        date_texts = []
        title_text = []
        like_texts = []

        try:
            driver = webdriver.Chrome(path)
            time.sleep(delay)

        except Exception:
            print("Chromedriver kullanılamıyor.")
            sys.exit()

        # The browser exists from here on; shut it down on every exit path.
        try:
            try:
                driver.get(url)
                time.sleep(delay)
                driver.maximize_window()
                time.sleep(delay)

            except Exception:
                print("Youtube'a erişilemiyor.")
                sys.exit()

            time.sleep(delay+2)
            comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
            title = driver.find_element_by_class_name("title").text
            time.sleep(delay)

            driver.execute_script("arguments[0].scrollIntoView();", comment_section)
            time.sleep(delay+2)

            # The header text is read as "<count> Yorum", with "." stripped
            # as a thousands separator.
            comment_count = driver.find_element_by_class_name("count-text.ytd-comments-header-renderer").text
            comment_count = comment_count.replace(" Yorum","")
            comment_count = comment_count.replace(".","")
            comment_count = int(comment_count)

            last_height = driver.execute_script("return document.documentElement.scrollHeight")

            while True:
                driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                # Give the page time to append more comments; without this
                # wait the height is compared before anything could load and
                # the loop stops after the first scroll.
                time.sleep(delay)
                new_height = driver.execute_script("return document.documentElement.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Query the thread list once instead of once per comment.
            comments = driver.find_elements_by_xpath("//*[@id='contents']/ytd-comment-thread-renderer")
            for number, comment in enumerate(comments[:comment_count], start=1):
                print("Veri çekiliyor...")
                print("Yorum: " + str(number))
                try:
                    author = comment.find_element_by_id("author-text").text
                    date = comment.find_element_by_class_name("published-time-text").text
                    comment_text = comment.find_element_by_id("content-text").text
                    likes = comment.find_element_by_id("vote-count-middle").text

                except Exception:
                    # Best effort: stop at the first thread that cannot be read.
                    break

                # All fields are appended together so the columns stay aligned.
                author_texts.append(author)
                date_texts.append(date)
                comment_texts.append(comment_text)
                like_texts.append(likes)
                title_text.append(title)

        finally:
            driver.quit()

        # Truncate every column to the shortest one; no extra "- 1", which
        # used to discard the last scraped comment.
        columns = [comment_texts, author_texts, date_texts, like_texts, title_text]
        limit = min(map(len, columns))

        df = pd.DataFrame({"Yorumlar": comment_texts[:limit]})

        if scrape_author:
            df["Kullanıcı"] = author_texts[:limit]

        if scrape_date:
            df["Yorum Tarihi"] = date_texts[:limit]

        if scrape_like:
            df["Yorumun Aldığı Beğeni Sayısı"] = like_texts[:limit]

        if scrape_title:
            df["Video Başlığı"] = title_text[:limit]

        df.to_excel(file, header = True, index = False)
        x = "Çektiğiniz veriler " + file + " adlı excel dosyasına kaydedildi."
        print(x)

        print("""
            --------------------------------------------------------------------------
            -  Projeden memnun kaldıysanız Github üzerinden yıldızlamayı unutmayın.  -
            -  Github Hesabım: ardauzunoglu                                          -
            --------------------------------------------------------------------------
        """)

        time.sleep(3)

    initialize()
    scrape()


if __name__ == "__main__":
    youtube_scrape()