-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheadline.py
290 lines (229 loc) · 9.47 KB
/
headline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Use a pipeline as a high-level helper
from transformers import pipeline
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR
from rouge_score import rouge_scorer
import math
import os
# Project: a machine-learning app that generates a headline for a given news article.
# NOTE(review): the working directory and the dataset paths below are hard-coded
# absolute Windows paths; they must be adjusted before running on another machine.
os.chdir(r'f:\Python Project\Headline app')
# Locations of the labelled train/dev CSV files and the unlabelled test CSV file
train_path = r'f:\Python Project\Headline app\LABELLED_TRAIN.csv'
dev_path = r'f:\Python Project\Headline app\LABELLED_DEV.csv'
test_path =r'f:\Python Project\Headline app\UNLABELLED_TEST.csv'
# Load the three CSV files into pandas DataFrames
train_df = pd.read_csv(train_path)
dev_df = pd.read_csv(dev_path)
test_df = pd.read_csv(test_path)
class HeadlineDataset(Dataset):
    """Dataset of (article, headline) pairs tokenised on access for seq2seq training.

    Args:
        articles: Indexable collection of article texts (presumably ``str``;
            each is prefixed with ``"summarize: "`` before tokenisation).
        headlines: Indexable collection of reference headlines, aligned with
            ``articles`` by position.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Padded/truncated token length of the encoded article.
        max_target_len: Padded/truncated token length of the encoded headline.
    """

    def __init__(self, articles, headlines, tokenizer, max_len=512, max_target_len=50):
        self.articles = articles
        self.headlines = headlines
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.max_target_len = max_target_len

    def __len__(self):
        """Return the number of articles in the dataset."""
        return len(self.articles)

    def __getitem__(self, idx):
        """Return the tokenised sample at ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of
            length ``max_len``), ``'labels'`` (1-D tensor of length
            ``max_target_len`` with padded positions set to -100) and
            ``'raw_headline'`` (the untokenised reference headline).
        """
        article = "summarize: " + self.articles[idx]
        headline = self.headlines[idx]

        inputs = self.tokenizer(
            article,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        targets = self.tokenizer(
            headline,
            max_length=self.max_target_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # Padded label positions must be -100 so the model's cross-entropy
        # loss ignores them; otherwise the loss is dominated by padding.
        padding_positions = targets['attention_mask'].squeeze(0) == 0
        labels = targets['input_ids'].squeeze(0).masked_fill(padding_positions, -100)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
            'raw_headline': headline,  # kept as text for ROUGE calculation
        }
def calculate_rouge_batch(predictions, references):
    """Return the mean ROUGE-L F-measure over paired predictions and references.

    Args:
        predictions: Iterable of generated headline strings.
        references: Iterable of reference headline strings, aligned with
            ``predictions``; surplus items on either side are ignored.

    Returns:
        The mean F-measure as a float, or ``0.0`` when there are no pairs.
    """
    pairs = list(zip(predictions, references))
    if not pairs:
        # np.mean([]) would emit a RuntimeWarning and return NaN, which then
        # propagates into every average computed from this value.
        return 0.0
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    f_measures = [scorer.score(ref, pred)['rougeL'].fmeasure for pred, ref in pairs]
    return float(np.mean(f_measures))
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``dataloader``.

    For every batch the function computes the loss, backpropagates, generates
    headlines with beam search to track ROUGE-L, clips gradients to a norm of
    1.0, and then steps both the optimizer and the scheduler.

    NOTE: decoding uses the module-level ``tokenizer`` rather than a parameter.

    Args:
        model: Seq2seq model exposing ``__call__`` (returning ``.loss``) and
            ``generate``.
        dataloader: Yields batches with ``'input_ids'``, ``'attention_mask'``,
            ``'labels'`` and ``'raw_headline'``.
        optimizer: Optimizer updating ``model.parameters()``.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean loss per batch, mean of the per-batch ROUGE-L scores)``.
    """
    model.train()
    total_loss = 0
    all_rouge_scores = []
    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        raw_headlines = batch['raw_headline']

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss

        # Backpropagate before generating so the autograd graph is released
        # and does not sit in memory for the whole beam search.
        loss.backward()

        # Generate headlines for ROUGE calculation. Switch to eval mode so
        # dropout does not perturb the decoded text, and always restore train
        # mode. Weights are still the pre-step ones, as the loss used.
        model.eval()
        try:
            with torch.no_grad():
                generated = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=50,
                    num_beams=5,
                    early_stopping=True
                )
        finally:
            model.train()
        predictions = tokenizer.batch_decode(generated, skip_special_tokens=True)
        rouge_score = calculate_rouge_batch(predictions, raw_headlines)
        all_rouge_scores.append(rouge_score)

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'rouge-l': f'{rouge_score:.4f}',
            'lr': f'{scheduler.get_last_lr()[0]:.2e}'
        })

    return total_loss / len(dataloader), np.mean(all_rouge_scores)
def evaluate(model, dataloader, device):
    """Measure loss and ROUGE-L of ``model`` over ``dataloader`` without training.

    The model is put in eval mode and gradients are disabled. Generated ids
    are decoded with the module-level ``tokenizer``.

    Returns:
        Tuple ``(mean loss per batch, mean of the per-batch ROUGE-L scores)``.
    """
    model.eval()
    running_loss = 0
    rouge_per_batch = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            encoder_inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            target_ids = batch['labels'].to(device)
            reference_headlines = batch['raw_headline']

            # Teacher-forced forward pass to obtain the loss
            forward_out = model(**encoder_inputs, labels=target_ids)
            running_loss = running_loss + forward_out.loss.item()

            # Beam-search decoding to obtain text for ROUGE scoring
            generated_ids = model.generate(
                **encoder_inputs,
                max_length=50,
                num_beams=5,
                early_stopping=True,
            )
            decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            rouge_per_batch.append(calculate_rouge_batch(decoded, reference_headlines))

    mean_loss = running_loss / len(dataloader)
    mean_rouge = np.mean(rouge_per_batch)
    return mean_loss, mean_rouge
def generate_test_predictions(model, test_df, tokenizer, device,
                              output_path='test_predictions.csv'):
    """Generate a headline for every test article and write them to a CSV file.

    Args:
        model: Seq2seq model exposing ``generate``.
        test_df: DataFrame with ``'News Article'`` and ``'ID'`` columns. It is
            modified in place: a ``'Prediction'`` column is added.
        tokenizer: Tokenizer used to encode articles and decode generated ids.
        device: Device the encoded inputs are moved to.
        output_path: Destination CSV file for the ``ID``/``Prediction`` columns.
    """
    model.eval()
    test_articles = test_df['News Article'].tolist()
    predictions = []

    with torch.no_grad():
        for article in tqdm(test_articles, desc="Generating Test Predictions"):
            inputs = tokenizer(
                "summarize: " + article,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            generated = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=50,
                num_beams=5,
                early_stopping=True
            )
            # One article per call, so the first sequence is the only one.
            prediction = tokenizer.decode(generated[0], skip_special_tokens=True)
            predictions.append(prediction)

    test_df['Prediction'] = predictions
    test_df[['ID', 'Prediction']].to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")
# Main training setup: load the pretrained checkpoint, fine-tune it for a few
# epochs, keep the checkpoint with the best validation ROUGE-L, plot the
# training curves and finally write predictions for the test set.
print("Initializing model and tokenizer..... Made by Divyanshu Kannaujiya")
# To fine-tune a different checkpoint (e.g. 't5-base' or a local directory such
# as './local_t5_base'), change the name in BOTH calls below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Prepare data
X_train = train_df['News Article'].tolist()
y_train = train_df['Caption'].tolist()
X_dev = dev_df['News Article'].tolist()
y_dev = dev_df['Caption'].tolist()

# Create datasets and dataloaders
train_dataset = HeadlineDataset(X_train, y_train, tokenizer)
dev_dataset = HeadlineDataset(X_dev, y_dev, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8)

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model.to(device)
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# The scheduler is stepped once per batch, so its horizon is the total number
# of optimisation steps across all epochs.
total_steps = len(train_loader) * epochs
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=1e-6)

# Single output directory for BOTH the model weights and the tokenizer files.
# Previously the two were saved under names differing only in letter case,
# which splits the checkpoint across two directories on case-sensitive
# filesystems and makes it unloadable from either one.
best_model_dir = "Best_headline_model"

# Training loop with ROUGE-L tracking
best_rouge_l = 0
results = []
print("Starting training...")
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    train_loss, train_rouge = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_rouge = evaluate(model, dev_loader, device)
    print(f"Train Loss: {train_loss:.4f}, Train ROUGE-L: {train_rouge:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val ROUGE-L: {val_rouge:.4f}")
    results.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'train_rouge': train_rouge,
        'val_loss': val_loss,
        'val_rouge': val_rouge
    })
    # Save best model based on ROUGE-L score
    if val_rouge > best_rouge_l:
        best_rouge_l = val_rouge
        model.save_pretrained(best_model_dir)
        tokenizer.save_pretrained(best_model_dir)
        print(f"Saved best model with ROUGE-L: {val_rouge:.4f}")

# Print final results
print("\nFinal Results:")
results_df = pd.DataFrame(results)
print(results_df)

# Plot training progress: loss on the left, ROUGE-L on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(results_df['epoch'], results_df['train_loss'], label='Train Loss')
plt.plot(results_df['epoch'], results_df['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(results_df['epoch'], results_df['train_rouge'], label='Train ROUGE-L')
plt.plot(results_df['epoch'], results_df['val_rouge'], label='Val ROUGE-L')
plt.xlabel('Epoch')
plt.ylabel('ROUGE-L Score')
plt.legend()
plt.tight_layout()
plt.show()

# Generate predictions for test data.
# NOTE(review): this uses the in-memory model from the final epoch, which is
# not necessarily the best checkpoint saved above.
print("Generating predictions for test data......Made by Divyanshu Kannaujiya")
generate_test_predictions(model, test_df, tokenizer, device)