Showing 4 changed files with 612 additions and 0 deletions.
@@ -0,0 +1,39 @@
# News Generation

## Introduction

This project is part of the Teknofest 2024 Türkçe Doğal Dil İşleme (Turkish Natural Language Processing) competition. The aim of the project is to generate a news title and content from a given image.

## Dataset

The dataset was collected from the [Sabah](https://www.sabah.com.tr/timeline/) news website. It consists of news titles, news content, and images, all in Turkish.
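
For reference, a minimal sketch of inspecting the collected CSV. The `../Data/news-data.csv` path and the `Img_url` column come from the download script below; any other column names are assumptions.

```python
import pandas as pd

# Path and the 'Img_url' column are taken from the download script below;
# the remaining column names are assumptions, so print them to check.
df = pd.read_csv('../Data/news-data.csv')
print(df.columns.tolist())
print(df['Img_url'].head())
```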

## Data Preprocessing

- Sample data:

  ![image](https://isbh.tmgrup.com.tr/sbh/2022/03/31/balikesirde-tarihi-binada-baslayan-ve-restorana-sicrayan-yangin-sonduruldu-1648686422986.jpeg)

***

```python
title = "Balıkesir’de tarihi bina yangında küle döndü"
word_index = {'Balıkesir’de': 9, 'tarihi': 5, 'bina': 3, 'yangında': 7, 'küle': 5, 'döndü': 6}
tokens = [start_token, 9, 5, 3, 7, 5, 6, end_token]
```

| Input                                       | Output    |
| ------------------------------------------- | --------- |
| Image + start_token                         | 9         |
| Image + start_token + 9                     | 5         |
| Image + start_token + 9 + 5                 | 3         |
| Image + start_token + 9 + 5 + 3             | 7         |
| Image + start_token + 9 + 5 + 3 + 7         | 5         |
| Image + start_token + 9 + 5 + 3 + 7 + 5     | 6         |
| Image + start_token + 9 + 5 + 3 + 7 + 5 + 6 | end_token |
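
This is standard next-token (teacher-forcing) expansion: each title yields one training pair per token. A minimal sketch of that expansion, assuming illustrative ids for start_token and end_token:

```python
start_token, end_token = 1, 2  # assumed special-token ids

def make_training_pairs(image, token_ids):
    """Expand one (image, title) example into (image, prefix) -> next-token pairs."""
    seq = [start_token] + token_ids + [end_token]
    return [(image, seq[:i], seq[i]) for i in range(1, len(seq))]

# The title above, as token ids from word_index:
for _, prefix, target in make_training_pairs("img_1.jpeg", [9, 5, 3, 7, 5, 6]):
    print(prefix, "->", target)
```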

## Model

The model combines a CNN and an LSTM: the image is fed to the encoder (CNN), and the CNN's output is fed to the decoder (LSTM) together with the input text.
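
A minimal sketch of such an encoder-decoder in Keras, following the common "merge" image-captioning pattern; the layer sizes, input resolution, and vocabulary size are illustrative assumptions, not the repository's actual configuration:

```python
from tensorflow.keras import layers, Model

vocab_size = 10000  # assumed vocabulary size
max_len = 20        # assumed maximum title length in tokens
embed_dim = 256

# Encoder (CNN): map the image to a single feature vector.
image_in = layers.Input(shape=(224, 224, 3))
x = layers.Conv2D(32, 3, activation="relu")(image_in)
x = layers.MaxPooling2D()(x)
x = layers.Conv2D(64, 3, activation="relu")(x)
x = layers.GlobalAveragePooling2D()(x)
img_feat = layers.Dense(embed_dim, activation="relu")(x)

# Decoder (LSTM): encode the token prefix, merge with the image
# features, and predict the next token.
tokens_in = layers.Input(shape=(max_len,))
e = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(tokens_in)
h = layers.LSTM(embed_dim)(e)

merged = layers.add([img_feat, h])
out = layers.Dense(vocab_size, activation="softmax")(merged)

model = Model(inputs=[image_in, tokens_in], outputs=out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
```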
@@ -0,0 +1,82 @@
import pandas as pd
import requests
import os

def get_data():
    """Read the scraped news CSV into a DataFrame."""
    data_path = '../Data/news-data.csv'
    try:
        df = pd.read_csv(data_path)
        df = df.iloc[:2]  # NOTE: only the first two rows are kept (apparently for testing)
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return pd.DataFrame()

def download_img(url, index, img_folder):
    """Download one image into img_folder and return its file name, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            img_type = url.split('.')[-1]
            img_name = f"img_{index + 1}.{img_type}"
            img_path = os.path.join(img_folder, img_name)

            # Write the file only if it does not already exist.
            if not os.path.exists(img_path):
                with open(img_path, 'wb') as f:
                    f.write(response.content)
            return img_name  # Return only the image name
        else:
            print(f"Failed to download image from {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return None

def main():
    df = get_data()
    if df.empty:
        print("No data to process.")
        return

    img_folder = '../Data/imgs'

    if not os.path.exists(img_folder):
        os.makedirs(img_folder)
    else:
        # Delete all files in the folder so each run starts clean.
        for file in os.listdir(img_folder):
            file_path = os.path.join(img_folder, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(f"Error deleting file {file_path}: {e}")

    csv_file = '../Data/news-data-with-imgs.csv'

    if os.path.exists(csv_file):
        os.remove(csv_file)

    valid_rows = []

    for i, row in df.iterrows():
        img_url = row['Img_url']
        img_name = download_img(img_url, i, img_folder)

        # Keep only rows whose image downloaded successfully.
        if img_name:
            row['img_path'] = img_name
            valid_rows.append(row)

    valid_df = pd.DataFrame(valid_rows)
    try:
        valid_df.to_csv(csv_file, index=False)
        print(f"Downloaded {len(valid_df)} images")
    except Exception as e:
        print(f"Error saving CSV file: {e}")

    return valid_df

if __name__ == '__main__':
    main()
@@ -0,0 +1,137 @@
from selenium import webdriver
from bs4 import BeautifulSoup

from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from urllib.error import URLError

import time

# Fetch the news page at the given URL and return it as parsed HTML.
def get_news(url="", day_news=False):
    # Error counters declared as globals (not updated in the code shown here).
    global timeoutException_count
    global webDriverException_count
    global urlError_count

    driver_path = "../chromedriver-win32/chromedriver.exe"
    driver = None

    try:
        main_url = "https://www.sabah.com.tr"

        full_url = main_url + url

        service = ChromeService(executable_path=driver_path)

        driver = webdriver.Chrome(service=service)

        # Shrink the window; the page is read from its source, not the viewport.
        driver.set_window_size(5, 5)

        # Open the requested URL.
        driver.get(full_url)

        if day_news:
            # Scroll to the bottom repeatedly until the page height stops
            # growing, so that lazily loaded news items are rendered.
            last_height = -10
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                new_height = driver.execute_script("return document.body.scrollHeight")

                if new_height == last_height:
                    break
                last_height = new_height
                time.sleep(1.5)

        # Grab the page source and parse it with BeautifulSoup.
        source = driver.page_source
        soup = BeautifulSoup(source, features="html.parser")

    except TimeoutException:
        return None

    except WebDriverException:
        return None

    except URLError:
        return None

    finally:
        # Close the WebDriver; it may never have been created if startup failed.
        if driver is not None:
            driver.quit()

    # Return the parsed HTML content.
    return soup

# Extract the body text of a news article.
def get_content(content_soup, url=""):
    # Accumulates the article text.
    news_content = ""
    # Find the top-level divs whose class is specific to this URL.
    # find_all returns a (possibly empty) list, so iterate over it directly.
    top_div = content_soup.find_all("div", attrs={"class": "col-sm-12 view20" + url})
    for div in top_div:
        # Find the content div nested inside.
        inner_div = div.find("div", attrs={"class": "newsDetailText"})
        if inner_div is not None:
            # Find the detail div inside the content div.
            detail_div = inner_div.find("div", attrs={"class": "newsBox selectionShareable"})
            if detail_div is not None:
                # Concatenate the text of every paragraph in the detail div.
                for paragraph in detail_div.find_all('p'):
                    news_content += paragraph.text
    # Return the assembled article text.
    return news_content

# Fetch article content from a 'roza' URL.
def get_news_from_roza(content_url):
    # Accumulates the article text.
    news_content = ""

    try:
        # Fetch the news page for the given 'roza' URL.
        soup = get_news(url=content_url)

        # Find the div that holds the article text.
        paragraphs = soup.find("div", class_="detail-text-area")

        if paragraphs is not None:
            # Concatenate every paragraph that does not contain a "strong" tag.
            for p in paragraphs.find_all("p"):
                if not p.find("strong"):
                    news_content += p.get_text(strip=True) + " "

    except Exception as e:
        print(f"Error: {e}")

    # Return the assembled article text.
    return news_content.strip()

def get_news_from_finans(content_url):
    news_content = ""

    soup = get_news(url=content_url)
    # get_news returns None on failure, so guard before parsing.
    if soup is None:
        return news_content

    # Find the div that holds the article text.
    paragraphs = soup.find("div", attrs={"class": "detail-text-area"})

    if paragraphs is not None:
        # Concatenate every paragraph that does not contain a "strong" tag.
        for p in paragraphs.find_all("p"):
            if p.find("strong") is None:
                news_content += p.text

    # Return the assembled article text.
    return news_content
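
A hypothetical usage sketch for these helpers; the `/timeline/` path comes from the README's dataset source, and the print is only a smoke test:

```python
if __name__ == "__main__":
    # Scroll-load the timeline page so lazily rendered items are included.
    soup = get_news(url="/timeline/", day_news=True)
    if soup is not None:
        print(soup.title.string)
```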