# sparks_foundation_task1.py

# -*- coding: utf-8 -*-
"""Sparks_foundation_Task1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Byy591Sgh1Zt2X74zxB_Uq67Pqlkcc47

Author: Tejal Joshi

The Sparks Foundation Task-1
### Task-1: Prediction using Supervised Machine Learning
#### **Predict the percentage score of a student based on the number of study hours.**
"""

# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import accuracy_score

# Load the hours-vs-scores CSV from its remote URL into a DataFrame
dataset_url = 'http://bit.ly/w-data'
data = pd.read_csv(dataset_url)
print("Data imported successfully!")

"""### **Understanding The Dataset**"""

# Understanding the data: size, summary statistics and a peek at both ends
print('Shape of the dataset: ',data.shape)
# Start the describe() table on its own line so its column header stays
# aligned with the rows instead of being pushed right by the label.
print('Description of the data', data.describe(), sep='\n')

# In a plain script a bare expression is evaluated and then discarded (only a
# notebook cell echoes it), so each result is printed explicitly.
print(data.head())

print(data.tail())

# Checking for missing values: number of nulls in each column
print(data.isnull().sum())

"""No missing values *found*

## **Visualising the Dataset**
"""

# Visualising the dataset
x = data['Hours'] # x will contain 'Hours' column
y = data['Scores'] # y will contain 'Scores' column
# Plot the lowercase `x` bound just above: names are case-sensitive, and the
# uppercase `X` feature matrix is not assigned until later in the script.
plt.scatter(x, y, label='Hours Vs Scores', color='green')
plt.xlabel('Hours')
plt.ylabel('Scores of students')
plt.grid()
# legend() picks up the label passed to scatter() above
plt.legend()
plt.show()

"""### **Data Preparation**"""

# Separate the features from the target.
# X takes every column except the last and stays 2-D; y is the column at
# position 1 as a flat array.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, 1].to_numpy()

print('X :',X)
print('y :',y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
_split_parts = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = _split_parts

"""The dataset is split into train and test sets:

**Train set consists of 80% of the original data**

**Test set consists of 20% of the original data**

### **Training the Model**
"""

# Fit a linear regression on the training split.
# (LinearRegression is already imported at the top of the file; repeating the
# import here is harmless.)
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, y_train)

# Fitted line evaluated at every observed X: slope * hours + intercept
slope, intercept = model.coef_, model.intercept_
line = slope * X + intercept

# Draw the fitted line over the full dataset (training and test rows together)
plt.title('Regression model visualization')
plt.xlabel('Hours')
plt.ylabel('Scores')
plt.grid()
plt.scatter(X, y, color='red')
plt.plot(X, line)
plt.show()

"""### **Predicting test set**"""

# Predict scores for the held-out hours
y_pred = model.predict(X_test)
print(y_pred)

# Side-by-side table of actual vs. predicted scores for the test rows
comparison_columns = {'Actual Scores': y_test, 'Predicted Scores': y_pred}
dt = pd.DataFrame(comparison_columns)
print(dt)

# Predicted score for a student who studies 9.25 hrs/day.
# predict() takes a 2-D input (samples x features), hence the nested list.
hours = [[9.25]]
result = model.predict(hours)
print('Number of hours student studies: ',hours)
print('Predicted score: ',result)

"""### **Visualization of the results**"""

# Test-set results: red dots are the actual scores, the blue line joins the
# model's predictions for the same hours.
plt.title('Comparison between Actual and predicted values')
plt.xlabel('Hours')
plt.ylabel('Scores')
plt.grid()
plt.scatter(X_test, y_test, color='red')
plt.plot(X_test, y_pred, color='blue')
plt.show()

"""### **Evaluating the Results**"""

from sklearn import metrics
# Average absolute gap between the actual and predicted test scores
print('Mean Absolute Error:',
      metrics.mean_absolute_error(y_test, y_pred))

# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
# Multiplied by 100 to show it as a percentage.
print('R-squared score (%): ', model.score(X_test, y_test) * 100)