Web Scraping added
fawern committed Aug 27, 2024
1 parent 18c4ea4 commit 4b53001
Showing 4 changed files with 612 additions and 0 deletions.
39 changes: 39 additions & 0 deletions Web Scraping/README.md
@@ -0,0 +1,39 @@
# News Generation

## Introduction

This project is part of the Teknofest 2024 Türkçe Doğal Dil İşleme competition. The aim of the project is to
generate a news title and news content from a given image.

## Dataset

The dataset is collected from the [Sabah](https://www.sabah.com.tr/timeline/) news website.
It consists of news titles, news content, and images, all in Turkish.

## Data-Preprocessing

- Sample Data:
![image](https://isbh.tmgrup.com.tr/sbh/2022/03/31/balikesirde-tarihi-binada-baslayan-ve-restorana-sicrayan-yangin-sonduruldu-1648686422986.jpeg)

***

```python
title = "Balıkesir’de tarihi bina yangında küle döndü"
word_index = {'Balıkesir’de': 9, 'tarihi': 5, 'bina': 3, 'yangında': 7, 'küle': 5, 'döndü': 6 }
tokens = [start_token, 9, 5, 3, 7, 5, 6, end_token]
```

| Input | Output |
| ------------------------------------------- | --------- |
| Image + start_token | 9 |
| Image + start_token + 9 | 5 |
| Image + start_token + 9 + 5 | 3 |
| Image + start_token + 9 + 5 + 3 | 7 |
| Image + start_token + 9 + 5 + 3 + 7 | 5 |
| Image + start_token + 9 + 5 + 3 + 7 + 5 | 6 |
| Image + start_token + 9 + 5 + 3 + 7 + 5 + 6 | end_token |
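
The table above can be generated mechanically from the token sequence. Here is a minimal sketch of that expansion; the `start_token`/`end_token` ids are placeholder values for illustration, not the project's actual vocabulary:

```python
start_token, end_token = 1, 2  # assumed special-token ids, for illustration only
tokens = [start_token, 9, 5, 3, 7, 5, 6, end_token]

# Each training pair is (image + token prefix) -> next token, as in the table above.
pairs = [(tokens[:i], tokens[i]) for i in range(1, len(tokens))]

for prefix, target in pairs:
    print(prefix, "->", target)
```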

## Model

The model is a combination of a CNN and an LSTM: the image is fed to the encoder (CNN), and the output of the
CNN is fed to the decoder (LSTM) along with the input text.
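
Below is a minimal sketch of this encoder-decoder idea, assuming a Keras/TensorFlow setup; the layer sizes, `vocab_size`, `max_len`, and input shape are placeholder assumptions, not the project's actual configuration.

```python
from tensorflow.keras import layers, Model

vocab_size = 10000  # assumed vocabulary size
max_len = 20        # assumed maximum title length

# Encoder (CNN): image -> feature vector
image_input = layers.Input(shape=(224, 224, 3))
x = layers.Conv2D(32, 3, activation="relu")(image_input)
x = layers.MaxPooling2D()(x)
x = layers.Conv2D(64, 3, activation="relu")(x)
x = layers.GlobalAveragePooling2D()(x)
image_features = layers.Dense(512, activation="relu")(x)

# Decoder (LSTM): token prefix + image features -> next-token distribution
token_input = layers.Input(shape=(max_len,))
embedded = layers.Embedding(vocab_size, 256, mask_zero=True)(token_input)
lstm_out = layers.LSTM(512)(embedded)

merged = layers.add([image_features, lstm_out])
output = layers.Dense(vocab_size, activation="softmax")(merged)

model = Model(inputs=[image_input, token_input], outputs=output)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
```
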
82 changes: 82 additions & 0 deletions Web Scraping/downloadIMGS.py
@@ -0,0 +1,82 @@
import pandas as pd
import requests
import os

def get_data():
    data_path = '../Data/news-data.csv'
    try:
        df = pd.read_csv(data_path)
        df = df.iloc[:2]  # Only the first two rows are kept for processing
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return pd.DataFrame()

def download_img(url, index):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            img_type = url.split('.')[-1]
            img_name = f"img_{index + 1}.{img_type}"
            img_path = os.path.join(img_folder, img_name)

            # Write the file only if it does not already exist
            if not os.path.exists(img_path):
                with open(img_path, 'wb') as f:
                    f.write(response.content)
            return img_name  # Return only the image name
        else:
            print(f"Failed to download image from {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return None

def main():
df = get_data()
if df.empty:
print("No data to process.")
return

    # download_img reads img_folder as a module-level (global) variable
    global img_folder
    img_folder = '../Data/imgs'

if not os.path.exists(img_folder):
os.makedirs(img_folder)
else:
# Delete all files in the folder
for file in os.listdir(img_folder):
file_path = os.path.join(img_folder, file)
try:
if os.path.isfile(file_path):
os.unlink(file_path)
except Exception as e:
print(f"Error deleting file {file_path}: {e}")

csv_file = '../Data/news-data-with-imgs.csv'

if os.path.exists(csv_file):
os.remove(csv_file)

valid_rows = []

    for i, row in df.iterrows():
        img_url = row['Img_url']
        img_name = download_img(img_url, i)

        if img_name:
            row['img_path'] = img_name
            valid_rows.append(row)

valid_df = pd.DataFrame(valid_rows)
try:
valid_df.to_csv(csv_file, index=False)
print(f"Downloaded {len(valid_df)} images")
except Exception as e:
print(f"Error saving CSV file: {e}")

return valid_df

if __name__ == '__main__':
main()
137 changes: 137 additions & 0 deletions Web Scraping/getNewsSoup.py
@@ -0,0 +1,137 @@
from selenium import webdriver
from bs4 import BeautifulSoup

from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from urllib.error import URLError

import time

# Fetches the news page for a given URL.
def get_news(url="", day_news=False):
    global timeoutException_count
    global webDriverException_count
    global urlError_count

    driver_path = "../chromedriver-win32/chromedriver.exe"
    driver = None

    try:
        main_url = "https://www.sabah.com.tr"

        full_url = main_url + url

        service = ChromeService(executable_path=driver_path)

        driver = webdriver.Chrome(service=service)

        # Set the browser window size
        driver.set_window_size(5, 5)

        # Open the given URL
        driver.get(full_url)

if day_news:
last_height = -10
while True:
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
new_height = driver.execute_script("return document.body.scrollHeight")

if new_height == last_height:
break
last_height = new_height
time.sleep(1.5)

        # Get the page source and parse it with BeautifulSoup
        source = driver.page_source
        soup = BeautifulSoup(source, features="html.parser")

    except TimeoutException:
        return None

    except WebDriverException:
        return None

    except URLError:
        return None

    finally:
        # Close the WebDriver if it was started
        if driver is not None:
            driver.quit()

    # Return the parsed HTML content
    return soup

# Extracts the news content from a parsed news page.
def get_content(content_soup, url=""):
    # Empty string that will hold the news content
    news_content = ""
    # Find the top-level divs specific to the given URL
    top_div = content_soup.find_all("div", attrs={"class": "col-sm-12 view20" + url})
    # Continue if any matching divs were found (find_all returns a list)
    if top_div:
        # Iterate over each top-level div
        for div in top_div:
            # Find the nested content div
            inner_div = div.find("div", attrs={"class": "newsDetailText"})
            # Continue if the content div was found
            if inner_div is not None:
                # Find the detail div inside the content div
                detail_div = inner_div.find("div", attrs={"class": "newsBox selectionShareable"})
                # Continue if the detail div was found
                if detail_div is not None:
                    # Find the paragraphs inside the detail div
                    paragraphs = detail_div.find_all('p')
                    # Concatenate the text of each paragraph into the news content
                    for paragraph in paragraphs:
                        if paragraph is not None:
                            news_content += paragraph.text
    # Return the assembled news content
    return news_content

# Extracts the news content from a 'roza' URL.
def get_news_from_roza(content_url):
    # Empty string that will hold the news content
    news_content = ""

    try:
        # Fetch the news page for the given 'roza' URL
        soup = get_news(url=content_url)

        # Find the div that contains the news content
        paragraphs = soup.find("div", class_="detail-text-area")

        # Continue if the div was found
        if paragraphs is not None:
            # Iterate over each paragraph
            for p in paragraphs.find_all("p"):
                # If the paragraph has no "strong" tag, append its text
                if not p.find("strong"):
                    news_content += p.get_text(strip=True) + " "

    except Exception as e:
        print(f"Error : {e}")

    # Return the assembled news content
    return news_content.strip()

# Extracts the news content from a 'finans' URL.
def get_news_from_finans(content_url):
    news_content = ""

    soup = get_news(url=content_url)
    if soup is None:
        return news_content

    # Find the div that contains the news content
    paragraphs = soup.find("div", attrs={"class": "detail-text-area"})

    # Continue if the div was found
    if paragraphs is not None:
        # Iterate over each paragraph
        for p in paragraphs.find_all("p"):
            # If the paragraph has no "strong" tag, append its text
            if p.find("strong") is None:
                news_content += p.text

    # Return the assembled news content
    return news_content