-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprediction.py
84 lines (64 loc) · 2.93 KB
/
prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, train_test_split
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
INDEX_CSV_PATH = r"D:\stock-sentiment\index-ihsg.csv"
SENTIMENT_CSV_PATH = r"D:\stock-sentiment\datasentiment.csv"

# First date (inclusive) of the hold-out test period; earlier rows are training data.
TEST_START_DATE = '2024-10-04'
# The "actual" curve in the plot starts slightly before the first prediction.
PLOT_START_DATE = '2024-10-03'

SENTIMENT_ENCODING = {'positive': 1.0, 'neutral': 0.0, 'negative': -1.0}
FEATURE_COLUMNS = ['Sentiment_Encoded', 'Stock_Index_1day_ago']
TARGET_COLUMN = 'Stock Index'


def load_data(index_path: str = INDEX_CSV_PATH, sentiment_path: str = SENTIMENT_CSV_PATH) -> tuple:
    """Read both CSV files and keep only the columns the model uses.

    Returns:
        ``(index_df, sentiment_df)`` where ``index_df`` has the columns
        ``Date`` / ``Stock Index`` and ``sentiment_df`` has ``Date`` /
        ``Dominant_Sentiment``.
    """
    # Selecting by name already discards any leading index column in the CSV,
    # so no positional slicing is needed first.
    index_df = pd.read_csv(index_path)[['Date', TARGET_COLUMN]]
    sentiment_df = pd.read_csv(sentiment_path)[['Date', 'Dominant_Sentiment']]
    return index_df, sentiment_df


def build_dataset(index_df: pd.DataFrame, sentiment_df: pd.DataFrame) -> pd.DataFrame:
    """Merge index and sentiment data and derive the model features.

    The result is sorted by ascending ``Date`` and contains, in addition to
    the merged columns, ``Sentiment_Encoded`` (numeric sentiment) and
    ``Stock_Index_1day_ago`` (the index value of the preceding row, i.e. the
    previous date present in BOTH inputs). Rows lacking a date, a target, a
    lag value or a known sentiment label are dropped.

    The inputs are not modified.
    """
    prices = index_df[['Date', TARGET_COLUMN]].copy()
    labels = sentiment_df[['Date', 'Dominant_Sentiment']].copy()
    # Real timestamps make the merge, the ordering and the later date
    # comparisons independent of how the date strings happen to be formatted.
    prices['Date'] = pd.to_datetime(prices['Date'])
    labels['Date'] = pd.to_datetime(labels['Date'])

    df = pd.merge(prices, labels, on='Date')

    # shift(1) only means "previous day" when rows are in chronological order.
    # If the CSV is stored latest-first, an unsorted shift would leak the
    # NEXT day's value into the feature.
    df = df.sort_values('Date', kind='stable').reset_index(drop=True)

    df['Sentiment_Encoded'] = df['Dominant_Sentiment'].map(SENTIMENT_ENCODING)
    df['Stock_Index_1day_ago'] = df[TARGET_COLUMN].shift(1)

    # Labels missing from SENTIMENT_ENCODING map to NaN; the regression cannot
    # be fitted on NaN, so those rows are removed together with the first row
    # (which has no lag value).
    return df.dropna(subset=['Date', TARGET_COLUMN] + FEATURE_COLUMNS)


def split_by_date(df: pd.DataFrame, test_start: str = TEST_START_DATE) -> tuple:
    """Split chronologically: rows before ``test_start`` train, the rest test.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    is_test = df['Date'] >= pd.Timestamp(test_start)
    X = df[FEATURE_COLUMNS]
    y = df[TARGET_COLUMN]
    return X[~is_test], y[~is_test], X[is_test], y[is_test]


def plot_predictions(df: pd.DataFrame, y_pred,
                     test_start: str = TEST_START_DATE,
                     plot_start: str = PLOT_START_DATE) -> None:
    """Plot actual index values against the predictions for the test period.

    ``y_pred`` must hold one value per row of ``df`` dated on or after
    ``test_start``, in the same order.
    """
    # Actual values from PLOT_START_DATE onwards.
    actual = df[df['Date'] >= pd.Timestamp(plot_start)]
    predicted_dates = df.loc[df['Date'] >= pd.Timestamp(test_start), 'Date']

    plt.figure(figsize=(14, 7))
    plt.plot(actual['Date'], actual[TARGET_COLUMN], label='Actual Stock Trend',
             marker='o', color='red', markersize=4)
    plt.plot(predicted_dates, y_pred, label='Predicted Stock Trend (Linear Regression)',
             marker='o', linestyle='--', color='blue', markersize=4)

    plt.xlabel('Date', fontsize=14)
    # NOTE(review): the plotted series is the 'Stock Index' column; confirm
    # that it really is a percentage change, otherwise this label is wrong.
    plt.ylabel('Stock Changes (%)', fontsize=14)
    plt.title('Comparison of Linear Regression Predictions vs Actual IHSG Stock Trend', fontsize=16)
    plt.legend(fontsize=12)
    plt.xticks(rotation=45, fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    # The data is sorted ascending and plotted on a real date axis, so the
    # x-axis already runs from earliest to latest without being inverted.
    plt.tight_layout()
    plt.show()


def main() -> None:
    """Train, cross-validate, evaluate and plot the linear regression model."""
    # Set random seed for reproducibility
    np.random.seed(42)

    index_df, sentiment_df = load_data()
    df = build_dataset(index_df, sentiment_df)
    X_train, y_train, X_test, y_test = split_by_date(df)
    if X_train.empty or X_test.empty:
        raise ValueError(
            f"Need data on both sides of {TEST_START_DATE}: "
            f"{len(X_train)} training rows, {len(X_test)} test rows"
        )

    model_lr = LinearRegression()

    # Forward-chaining splits: every fold is validated on data that comes
    # after its training data, so no fold is trained on the future.
    cv_scores = cross_val_score(model_lr, X_train, y_train, cv=TimeSeriesSplit(n_splits=3))

    model_lr.fit(X_train, y_train)
    y_pred_lr = model_lr.predict(X_test)

    # Evaluation metrics (MSE & R2 for the model). Printed before plotting
    # because plt.show() blocks until the figure window is closed.
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    r2_lr = r2_score(y_test, y_pred_lr)
    print(f"MSE (LR): {mse_lr:.2f}, R2 (LR): {r2_lr:.2f}")
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average cross-validation score: {np.mean(cv_scores):.2f}")

    plot_predictions(df, y_pred_lr)


if __name__ == "__main__":
    main()