Showing 4 changed files with 612 additions and 0 deletions.
@@ -0,0 +1,39 @@
# News Generation

## Introduction

This project is part of the Teknofest 2024 Türkçe Doğal Dil İşleme (Turkish Natural Language Processing) competition. The aim of the project is to generate a news title and content from a given image.

## Dataset

The dataset was collected from the [Sabah](https://www.sabah.com.tr/timeline/) news website. It consists of news titles, news content, and images, all in Turkish.
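
For reference, a minimal sketch of inspecting the collected CSV. The `../Data/news-data.csv` path and the `Img_url` column come from the download script below; any other column names are assumptions.

```python
import pandas as pd

# Path and the 'Img_url' column are taken from the download script below;
# the remaining column names are assumptions, so print them to check.
df = pd.read_csv('../Data/news-data.csv')
print(df.columns.tolist())
print(df['Img_url'].head())
```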

## Data Preprocessing

- Sample data:

  ![image](https://isbh.tmgrup.com.tr/sbh/2022/03/31/balikesirde-tarihi-binada-baslayan-ve-restorana-sicrayan-yangin-sonduruldu-1648686422986.jpeg)

***

```python
title = "Balıkesir’de tarihi bina yangında küle döndü"
word_index = {'Balıkesir’de': 9, 'tarihi': 5, 'bina': 3, 'yangında': 7, 'küle': 5, 'döndü': 6}
tokens = [start_token, 9, 5, 3, 7, 5, 6, end_token]
```

| Input                                       | Output    |
| ------------------------------------------- | --------- |
| Image + start_token                         | 9         |
| Image + start_token + 9                     | 5         |
| Image + start_token + 9 + 5                 | 3         |
| Image + start_token + 9 + 5 + 3             | 7         |
| Image + start_token + 9 + 5 + 3 + 7         | 5         |
| Image + start_token + 9 + 5 + 3 + 7 + 5     | 6         |
| Image + start_token + 9 + 5 + 3 + 7 + 5 + 6 | end_token |
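
This is standard next-token (teacher-forcing) expansion: each title yields one training pair per token. A minimal sketch of that expansion, assuming illustrative ids for start_token and end_token:

```python
start_token, end_token = 1, 2  # assumed special-token ids

def make_training_pairs(image, token_ids):
    """Expand one (image, title) example into (image, prefix) -> next-token pairs."""
    seq = [start_token] + token_ids + [end_token]
    return [(image, seq[:i], seq[i]) for i in range(1, len(seq))]

# The title above, as token ids from word_index:
for _, prefix, target in make_training_pairs("img_1.jpeg", [9, 5, 3, 7, 5, 6]):
    print(prefix, "->", target)
```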

## Model

The model combines a CNN and an LSTM: the image is fed to the encoder (CNN), and the CNN's output is fed to the decoder (LSTM) together with the input text.
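
A minimal sketch of such an encoder-decoder in Keras, following the common "merge" image-captioning pattern; the layer sizes, input resolution, and vocabulary size are illustrative assumptions, not the repository's actual configuration:

```python
from tensorflow.keras import layers, Model

vocab_size = 10000  # assumed vocabulary size
max_len = 20        # assumed maximum title length in tokens
embed_dim = 256

# Encoder (CNN): map the image to a single feature vector.
image_in = layers.Input(shape=(224, 224, 3))
x = layers.Conv2D(32, 3, activation="relu")(image_in)
x = layers.MaxPooling2D()(x)
x = layers.Conv2D(64, 3, activation="relu")(x)
x = layers.GlobalAveragePooling2D()(x)
img_feat = layers.Dense(embed_dim, activation="relu")(x)

# Decoder (LSTM): encode the token prefix, merge with the image
# features, and predict the next token.
tokens_in = layers.Input(shape=(max_len,))
e = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(tokens_in)
h = layers.LSTM(embed_dim)(e)

merged = layers.add([img_feat, h])
out = layers.Dense(vocab_size, activation="softmax")(merged)

model = Model(inputs=[image_in, tokens_in], outputs=out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
```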
@@ -0,0 +1,82 @@
import pandas as pd
import requests
import os

def get_data():
    """Read the scraped news CSV into a DataFrame."""
    data_path = '../Data/news-data.csv'
    try:
        df = pd.read_csv(data_path)
        df = df.iloc[:2]  # NOTE: only the first two rows are kept (apparently for testing)
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return pd.DataFrame()

def download_img(url, index, img_folder):
    """Download one image into img_folder and return its file name, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            img_type = url.split('.')[-1]
            img_name = f"img_{index + 1}.{img_type}"
            img_path = os.path.join(img_folder, img_name)

            # Write the file only if it does not already exist.
            if not os.path.exists(img_path):
                with open(img_path, 'wb') as f:
                    f.write(response.content)
            return img_name  # Return only the image name
        else:
            print(f"Failed to download image from {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return None

def main():
    df = get_data()
    if df.empty:
        print("No data to process.")
        return

    img_folder = '../Data/imgs'

    if not os.path.exists(img_folder):
        os.makedirs(img_folder)
    else:
        # Delete all files in the folder so each run starts clean.
        for file in os.listdir(img_folder):
            file_path = os.path.join(img_folder, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(f"Error deleting file {file_path}: {e}")

    csv_file = '../Data/news-data-with-imgs.csv'

    if os.path.exists(csv_file):
        os.remove(csv_file)

    valid_rows = []

    for i, row in df.iterrows():
        img_url = row['Img_url']
        img_name = download_img(img_url, i, img_folder)

        # Keep only rows whose image downloaded successfully.
        if img_name:
            row['img_path'] = img_name
            valid_rows.append(row)

    valid_df = pd.DataFrame(valid_rows)
    try:
        valid_df.to_csv(csv_file, index=False)
        print(f"Downloaded {len(valid_df)} images")
    except Exception as e:
        print(f"Error saving CSV file: {e}")

    return valid_df

if __name__ == '__main__':
    main()
@@ -0,0 +1,137 @@
from selenium import webdriver
from bs4 import BeautifulSoup

from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from urllib.error import URLError

import time

# Fetch the news page at the given URL and return it as parsed HTML.
def get_news(url="", day_news=False):
    # Error counters declared as globals (not updated in the code shown here).
    global timeoutException_count
    global webDriverException_count
    global urlError_count

    driver_path = "../chromedriver-win32/chromedriver.exe"
    driver = None

    try:
        main_url = "https://www.sabah.com.tr"

        full_url = main_url + url

        service = ChromeService(executable_path=driver_path)

        driver = webdriver.Chrome(service=service)

        # Shrink the window; the page is read from its source, not the viewport.
        driver.set_window_size(5, 5)

        # Open the requested URL.
        driver.get(full_url)

        if day_news:
            # Scroll to the bottom repeatedly until the page height stops
            # growing, so that lazily loaded news items are rendered.
            last_height = -10
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                new_height = driver.execute_script("return document.body.scrollHeight")

                if new_height == last_height:
                    break
                last_height = new_height
                time.sleep(1.5)

        # Grab the page source and parse it with BeautifulSoup.
        source = driver.page_source
        soup = BeautifulSoup(source, features="html.parser")

    except TimeoutException:
        return None

    except WebDriverException:
        return None

    except URLError:
        return None

    finally:
        # Close the WebDriver; it may never have been created if startup failed.
        if driver is not None:
            driver.quit()

    # Return the parsed HTML content.
    return soup

# Extract the body text of a news article.
def get_content(content_soup, url=""):
    # Accumulates the article text.
    news_content = ""
    # Find the top-level divs whose class is specific to this URL.
    # find_all returns a (possibly empty) list, so iterate over it directly.
    top_div = content_soup.find_all("div", attrs={"class": "col-sm-12 view20" + url})
    for div in top_div:
        # Find the content div nested inside.
        inner_div = div.find("div", attrs={"class": "newsDetailText"})
        if inner_div is not None:
            # Find the detail div inside the content div.
            detail_div = inner_div.find("div", attrs={"class": "newsBox selectionShareable"})
            if detail_div is not None:
                # Concatenate the text of every paragraph in the detail div.
                for paragraph in detail_div.find_all('p'):
                    news_content += paragraph.text
    # Return the assembled article text.
    return news_content

# Fetch article content from a 'roza' URL.
def get_news_from_roza(content_url):
    # Accumulates the article text.
    news_content = ""

    try:
        # Fetch the news page for the given 'roza' URL.
        soup = get_news(url=content_url)

        # Find the div that holds the article text.
        paragraphs = soup.find("div", class_="detail-text-area")

        if paragraphs is not None:
            # Concatenate every paragraph that does not contain a "strong" tag.
            for p in paragraphs.find_all("p"):
                if not p.find("strong"):
                    news_content += p.get_text(strip=True) + " "

    except Exception as e:
        print(f"Error: {e}")

    # Return the assembled article text.
    return news_content.strip()

def get_news_from_finans(content_url):
    news_content = ""

    soup = get_news(url=content_url)
    # get_news returns None on failure, so guard before parsing.
    if soup is None:
        return news_content

    # Find the div that holds the article text.
    paragraphs = soup.find("div", attrs={"class": "detail-text-area"})

    if paragraphs is not None:
        # Concatenate every paragraph that does not contain a "strong" tag.
        for p in paragraphs.find_all("p"):
            if p.find("strong") is None:
                news_content += p.text

    # Return the assembled article text.
    return news_content
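
A hypothetical usage sketch for these helpers; the `/timeline/` path comes from the README's dataset source, and the print is only a smoke test:

```python
if __name__ == "__main__":
    # Scroll-load the timeline page so lazily rendered items are included.
    soup = get_news(url="/timeline/", day_news=True)
    if soup is not None:
        print(soup.title.string)
```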