-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassignment 3 sol r.R
49 lines (39 loc) · 1.7 KB
/
assignment 3 sol r.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# a. Load CSV in R by skipping second row
rm(list=ls()) # remove all the objects
setwd('C:/Users/Priyank/Desktop/kannu/DATASET')
getwd()
df=(read.csv("IMDB_data.csv",header=T)[-2,])
temp=(read.csv("IMDB_data.csv",header=T))
colnames(df)
str(df)
dim(df)
head(df,10)
head(df)
tail(df)
tail(df,10)
#b. Extract the unique genres and its count and store in data frame with index key.
# counting frequency of each unique value of Genre with table function
df$Genre = factor(df$Genre)
unique_Genre_with_count = table(df$Genre)
#counting total unique values
length_of_unique_genre = length(unique(df$Genre))
#making dataframe of Genre with index
df1 = data.frame(index = (1:length_of_unique_genre),
unique_Genre_with_count)
colnames(df1)[2:3] = c("Genre", "Count")
#c. Converting the required data type.
df1$Count=as.factor(df1$Count)
str(df1)
# Binning and converting data types
df$imdbRating = as.numeric(df$imdbRating)
df$imdbRatingcat[df$imdbRating >=10 & df$imdbRating <= 50]="Bad Movie"
df$imdbRatingcat[df$imdbRating >=51 & df$imdbRating <= 80]="Good Movie"
df$imdbRatingcat[df$imdbRating >80]="Awesome Movie"
#d. Sort the genre by its name
df=df[order(df$Genre),]# In ascending order
#df=df[order(-df$Genre),] In descending order
# e. Create new variable whose values should be square of difference between imdbrating and imdbvotes
#df$imdbVotes = as.numeric(as.character(df$imdbVotes))
#df$imdbRating = as.numeric(as.character(df$imdbRating))
df['Difference']=(df$imdbVotes-df$imdbRating)^2
#df['(R-V)^2'] = (df$imdbRating - df$imdbVotes)^2