-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsparks_foundation_task1.py
128 lines (92 loc) · 3.07 KB
/
sparks_foundation_task1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# -*- coding: utf-8 -*-
"""Sparks_foundation_Task1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Byy591Sgh1Zt2X74zxB_Uq67Pqlkcc47
Author: Tejal Joshi
The Sparks Foundation Task-1
### Task-1: Prediction using Supervised Machine Learning
#### **Predict the percentage of a student based on the number of study hours.**
"""
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
# Load the study-hours dataset from its public URL into a DataFrame
data = pd.read_csv(filepath_or_buffer='http://bit.ly/w-data')
print("Data imported successfully!")
"""### **Understanding The Dataset**"""
# Understanding The Data
# NOTE: this file is a notebook export. A bare expression such as
# `data.head()` is only displayed inside a notebook cell; when run as a
# script its result is silently discarded, so every summary is printed.
print('Shape of the dataset: ',data.shape)
print('Description of the data',data.describe())
# First and last rows of the dataset
print(data.head())
print(data.tail())
# checking for the missing values (count of nulls per column)
print(data.isnull().sum())
"""No missing values *found*
## **Visualising the Dataset**
"""
# Visualising the dataset
x = data['Hours']   # x will contain 'Hours' column
y = data['Scores']  # y will contain 'Scores' column
# Plot the lowercase `x` assigned just above. The uppercase `X` is not
# assigned until the data-preparation step further down, so referencing
# it here raised a NameError when the script ran top to bottom.
plt.scatter(x, y, label='Hours Vs Scores', color='green')
plt.xlabel('Hours')
plt.ylabel('Scores of students')
plt.grid()
plt.legend()
plt.show()
"""### **Data Preparation**"""
# Separate the target from the features.
# y: the column at position 1, as a 1-D array.
# X: every column except the last, as a 2-D array.
y = data.iloc[:, 1].values
X = data.iloc[:, :-1].values
print('X :', X)
print('y :', y)
# Hold out 20% of the rows for testing; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)
"""Splitting the dataset into : Train and Test datasets
**Train set consists of 80% of the original data**
**Test set consists of 20% of the original data**
### **Training the Model**
"""
# Fit a linear regression on the training split.
# (fit() returns the estimator itself, so construction and fitting chain.)
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, y_train)
# Regression line evaluated at every observed value in X
line = model.intercept_ + model.coef_ * X
# Overlay the fitted line on the full dataset (all points in X, not only
# the test split)
plt.scatter(X, y, color='red')
plt.plot(X, line)
plt.title('Regression model visualization')
plt.xlabel('Hours')
plt.ylabel('Scores')
plt.grid()
plt.show()
"""### **Predicting test set**"""
# Predicted scores for the held-out test inputs
y_pred = model.predict(X_test)
print(y_pred)
# Side-by-side table of the actual and the predicted scores
dt = pd.DataFrame(
    {
        'Actual Scores': y_test,
        'Predicted Scores': y_pred,
    }
)
print(dt)
# Single query: predicted score for a student studying 9.25 hrs/day.
# The value is nested ([[...]]) so it is passed as one row with one column.
hours = [[9.25]]
result = model.predict(X=hours)
print('Number of hours student studies: ', hours)
print('Predicted score: ', result)
"""### **Visualization of the results**"""
# Actual test scores as red points, model predictions as a blue line
plt.scatter(X_test, y_test, color='red')
plt.plot(X_test, y_pred, color='blue')
plt.title('Comparison between Actual and predicted values')
plt.xlabel('Hours')
plt.ylabel('Scores')
plt.grid()
plt.show()
"""### **Evaluating the Results**"""
from sklearn import metrics
# Mean absolute error between the actual and the predicted test scores
print('Mean Absolute Error:',
      metrics.mean_absolute_error(y_test, y_pred))
# For a regressor, model.score() returns the coefficient of determination
# (R-squared), not a classification accuracy, so the value is labelled as
# such. It is scaled by 100 as before.
print('R-squared score (%): ', model.score(X_test, y_test) * 100)