.Rhistory

# chi-squared test for outlier
#
# data:  price
# X-squared = 425.85, p-value < 2.2e-16
# alternative hypothesis: highest value 111111112 is an outlier
str(outlier)
outlier$alternative
rm(list = ls9)
rm(list = ls())
### ------------ Delete columns of categorical ------------ ###
DataDir = list.files()[file.info(list.files())$isdir]
# HousePriceTest = read.csv(list.files(DataDir, pattern = "test", full.names = TRUE))
HousePriceTrain = read.csv(list.files(DataDir, pattern = "train", full.names = TRUE))
head(HousePriceTrain)
dim(HousePriceTrain)
## pick some special variable which we want to keep
id = HousePriceTrain$id
time_trade = as.Date(HousePriceTrain$timestamp)
sub_area = HousePriceTrain$sub_area
## load raw data and pick the categorical data we want
DataDir = list.files()[file.info(list.files())$isdir]
HousePriceTrain = read.csv(list.files(DataDir, pattern = "train", full.names = TRUE))
cnames_train = colnames(HousePriceTrain)
cnames_train
grep("thermal_power_plant_raion", cnames_train)
grep("detention_facility_raion", cnames_train)
idx1 = grep("thermal_power_plant_raion", cnames_train)
idx2 = grep("detention_facility_raion", cnames_train)
idx1:idx2
head(HousePriceTrain[, idx1:idx2])
cnames_train[idx1:idx2]
cnames_train = grep("raion", cnames_train)
cnames_train
cnames_train = grep("raion", cnames_train)
cnames_train[idx1:idx2]
cnames_train[grep("raion", cnames_train)]
cnames_train = colnames(HousePriceTrain)
cnames_train[grep("raion", cnames_train)]
cnames_train[idx1:idx2]
sapply(HousePriceTrain, class)
sapply(HousePriceTrain, class) == "factor"
which(sapply(HousePriceTrain, class) == "factor")
idx_1 = which(sapply(HousePriceTrain, class) == "factor")
idx_2 = grep("raion", cnames_train)
idx1:idx2
idx_1 = which(sapply(HousePriceTrain, class) == "factor")
idx_2 = grep("raion", cnames_train)
idx_1
idx_2
union(idx_1, idx_2)
interaction(idx_1, idx_2)
intersect(idx_1, idx_2)
idx1:idx2
## find the categorical variables
# find the index for the specific variables
factor_idx = which(sapply(HousePriceTrain, class) == "factor")
raion_idx = grep("raion", cnames_train)
raion = HousePriceTrain[, intersect(factor_idx, raion_idx)]
type = HousePriceTrain$product_type
sub_area = HousePriceTrain$sub_area
raion
head(raion)
type
categorical = cbind(raion, type, sub_area)
categorical
categorical = cbind(raion, type, sub_area)[-column_to_remove]
categorical = cbind(raion, type, sub_area)[-row_idx_to_keep, ]
rm(list = ls())
### packages
library(vegan) # for isomap
### ---- read data and remove NA rows and columns that could cause multi-collinearity ---- ###
### read data from .rds file
HousePriceRDS = readRDS(list.files(pattern = "cart"))
HousePrice_withNA = HousePriceRDS$data
## remove variables by hand
cnames = colnames(HousePrice_withNA)
# pick the index of variable with "_min" in their names but not including "_min_price"
column_min_idx = setdiff(grep("_min", cnames), grep("_min_price", cnames))
column_cafavg_idx = grep("cafe_avg", cnames)
column_to_remove = union(column_min_idx, column_cafavg_idx)
# pick columns of only complete ones
row_idx_to_keep = complete.cases(HousePrice_withNA)
# remove rows and columns that we decided to remove
HousePrice = HousePrice_withNA[row_idx_to_remove, -column_to_remove]
# split data into id, data, price
id = HousePrice[, "id"]
data = HousePrice[, -match(c("id", "price_doc"), colnames(HousePrice))]
# find index of vector in another vector using "match"
price = HousePrice[, "price_doc"]
HousePrice_withNA = HousePriceRDS$data
## remove variables by hand
cnames = colnames(HousePrice_withNA)
# pick the index of variable with "_min" in their names but not including "_min_price"
column_min_idx = setdiff(grep("_min", cnames), grep("_min_price", cnames))
column_cafavg_idx = grep("cafe_avg", cnames)
column_to_remove = union(column_min_idx, column_cafavg_idx)
# pick columns of only complete ones
row_idx_to_keep = complete.cases(HousePrice_withNA)
# remove rows and columns that we decided to remove
HousePrice = HousePrice_withNA[row_idx_to_remove, -column_to_remove]
# pick columns of only complete ones
row_idx_to_keep = complete.cases(HousePrice_withNA)
# remove rows and columns that we decided to remove
HousePrice = HousePrice_withNA[row_idx_to_remove, -column_to_remove]
# remove rows and columns that we decided to remove
HousePrice = HousePrice_withNA[row_idx_to_keep, -column_to_remove]
# split data into id, data, price
id = HousePrice[, "id"]
data = HousePrice[, -match(c("id", "price_doc"), colnames(HousePrice))]
# find index of vector in another vector using "match"
price = HousePrice[, "price_doc"]
# scale the data
data_scaled = scale(data)
### load raw data and pick the categorical data we want
## read origin data
DataDir = list.files()[file.info(list.files())$isdir]
HousePriceTrain = read.csv(list.files(DataDir, pattern = "train", full.names = TRUE))
cnames_train = colnames(HousePriceTrain)
## find the categorical variables
# find the index for the specific variables
factor_idx = which(sapply(HousePriceTrain, class) == "factor")
raion_idx = grep("raion", cnames_train)
raion = HousePriceTrain[, intersect(factor_idx, raion_idx)]
type = HousePriceTrain$product_type
sub_area = HousePriceTrain$sub_area
categorical = cbind(raion, type, sub_area)[-row_idx_to_keep, ]
categorical
categorical = cbind(raion, type, sub_area)[row_idx_to_keep, ]
## save data in "HousePriceList.rds"
saveRDS(HousePriceList, "HousePriceList.rds")
### Combine data into a list
HousePriceList = list(id = id,
price = price,
continuous = data,
categorical = categorical)
## save data in "HousePriceList.rds"
saveRDS(HousePriceList, "HousePriceList.rds")
train <- read.csv("Russian House Price/train.csv", header= T)
#missing pattern
library(mice)
md.pattern(train)
library(VIM)
windows()
aggr(train,prop=FALSE,numbers=TRUE)
aggr(train,prop=FALSE,numbers=TRUE)
windows()
aggr(train,prop=FALSE,numbers=TRUE)
a <- c(2, 7, 8, 11, 12, 153, 119, 115, 107, 41, 40, 39, 38, 37, 36, 35, 34,
30, 13, 85, 100, 103, 114, 117, 121, 123, 187, 156, 158, 160, 179, 181,
202, 204, 206, 225, 227, 229, 248, 250, 252, 271, 273, 275)
#類別的行數
ncate <- sort(c(a, ))
b <- seq(from=188, to=199, by= 1)
c <- seq(from=69, to=84, by=1)
d <- seq(from=164, to=176, by=1)
e <- seq(from=210, to=222, by=1)
f <- seq(from=233, to=245, by=1)
g <- seq(from=256, to=268, by=1)
h <- seq(from=279, to=291, by=1)
no.cate <- sort(c(a, b, c, d, e, f, g, h))
#類別中不是計數資料的行數
cate.factor <- c(2, 7, 8, 11, 12, 153, 119, 115, 107, 41,
40, 39, 38, 37, 36, 35, 34, 30, 13)
#類別中不是計數資料的行數
cate.factor <- c(2, 7, 8, 11, 12, 153, 119, 115, 107, 41,
40, 39, 38, 37, 36, 35, 34, 30, 13)
#類別
cate <- train[, no.cate]
aggr(cate, prop=TRUE, numbers=TRUE)
windows()
aggr(train,prop=FALSE,numbers=TRUE)
train$state <- as.factor(train$state)
train$material <- as.factor(train$material)
levels(train$material)
#連續
cont <- train[, -no.cate]
windows()
aggr(cont, prop=TRUE, numbers=TRUE)
colnames(cont)
#distribution of price
qqnorm(train$price_doc)
qqline(train$price_doc)
#以area為分類,對log(price)作圖
ggplot(train)+
geom_boxplot(aes(x= sub_area, y=log(price_doc))) +
theme(axis.text.x = element_text(face = "bold", color = "black", size = 5, angle = 90)) +
labs(title = "不同地區log(價格)", x = "log(價格)", y = "地區")
#以area為分類,對log(price)作圖
library(ggplot2)
#以area為分類,對log(price)作圖
library(ggplot2)
ggplot(train)+
geom_boxplot(aes(x= sub_area, y=log(price_doc))) +
theme(axis.text.x = element_text(face = "bold", color = "black", size = 5, angle = 90)) +
labs(title = "不同地區log(價格)", x = "log(價格)", y = "地區")
boxplot(log(train$price_doc))
boxplot(log(train$price_doc), main = "對數(價格)")
#年月成交量
library(magrittr)
train$timestamp <- as.Date(train$timestamp)
train$ym <- format(train$timestamp, "%Y-%m")
volume <- table(train$ym)
mon.day <- seq.Date(from = as.Date("2011/08/01", format="%Y/%m/%d"), by="month",  length.out=length(volume))
plot(x= mon.day, y= volume, type= "o", pch="",
col=2, lwd= 2, main="年月成交量", ylab="成交量", xlab="年月")
#scatter plot of price and train distace
##zd_vokzaly_avto_km:Distance to train station
##ID_railroad_terminal:最近的火車終點站
dis1 <- ggplot(data= train, aes(x= log(price_doc), y=zd_vokzaly_avto_km)) +
geom_point(aes(colour=as.factor(ID_railroad_terminal)), position="jitter") +
labs(title = "價格與火車站距離的關係", x = "log(價格)", y = "與火車站距離") +
guides(colour = guide_legend("最近的火車終點站"))
#scatter plot of price and bus distace
##bus_terminal_avto_km:Distance to bus station
##ID_bus_terminal:最近的公車終點站
dis2 <- ggplot(data= train, aes(x= log(price_doc), y=bus_terminal_avto_km)) +
geom_point(aes(colour=as.factor(ID_bus_terminal)), position="jitter") +
labs(title = "價格與公車站距離的關係", x = "log(價格)", y = "與公車站距離") +
guides(colour = guide_legend("最近的公車終點站"))
windows()
cowplot::plot_grid(dis1, dis2, labels = "AUTO", ncol=1)
source("library.R")
install.packages("cowplot")
cowplot::plot_grid(dis1, dis2, labels = "AUTO", ncol=1)
windows()
warnings()
#時間與用途
##timestamp/ product_type
library(scales)
dateway <- train[, c(2, 12)] %>%  table %>% as.data.frame
dateway$timestamp  <- as.Date(dateway$timestamp)
datebreaks <- seq(as.Date("2011-08-01"), as.Date("2015-06-01"), by="month")
ggplot(data= dateway, aes(x=timestamp, y = Freq, fill = product_type)) +
geom_bar(position = "dodge", stat = "identity") +
labs(title = "交易時間與用途", x = "交易時間", y = "計數", fill="用途") +
scale_x_date(breaks=  datebreaks, labels=date_format("%Y-%m")) +
theme(axis.ticks=element_blank(), axis.text.x=element_text(angle=90), legend.position="bottom")
#############################
#羅
library(grid)
#價格與公寓空間(有離群和沒有離群)
p1 <- ggplot(aes(x=full_sq, y=price_doc), data=train) +
geom_point(color='#009FCC')+
labs(x='總面積', y='價錢', title='價錢與總面積(含離群值)')
p2 <- train %>%
filter(full_sq < 2000) %>%
ggplot(aes(x=full_sq, y=price_doc)) +
geom_point(color='#009FCC', alpha=0.5) +
labs(x='總面積', y='價錢', title='價錢與總面積(不含離群值)')
p2 <- train %>%
filter(full_sq < 2000) %>%
ggplot(aes(x=full_sq, y=price_doc)) +
geom_point(color='#009FCC', alpha=0.5) +
labs(x='總面積', y='價錢', title='價錢與總面積(不含離群值)')
#source("http://peterhaschke.com/Code/multiplot.R")
#multiplot(p1, p2, cols=2)
#library("gridExtra")
#grid.arrange(p1, p2, ncol=2)
#cowplot::plot_grid(p1, p2, labels = "AUTO")
library(patchwork)
#source("http://peterhaschke.com/Code/multiplot.R")
#multiplot(p1, p2, cols=2)
#library("gridExtra")
#grid.arrange(p1, p2, ncol=2)
#cowplot::plot_grid(p1, p2, labels = "AUTO")
install.packages("patchwork")
#source("http://peterhaschke.com/Code/multiplot.R")
#multiplot(p1, p2, cols=2)
#library("gridExtra")
#grid.arrange(p1, p2, ncol=2)
#cowplot::plot_grid(p1, p2, labels = "AUTO")
library(patchwork)
p1 + p2
p2 <- train %>%
filter(full_sq < 2000) %>%
ggplot(aes(x=full_sq, y=price_doc)) +
geom_point(color='#009FCC', alpha=0.5) +
labs(x='總面積', y='價錢', title='價錢與總面積(不含離群值)')
#平均價格前20地區
train %>% select(sub_area,price_doc)%>% group_by(sub_area)%>%
summarize(count=n(),price=mean(price_doc))%>%
arrange(desc(price))%>%head(n=20)%>%
ggplot(aes(x=factor(sub_area,levels=sub_area),y=price))+
geom_bar(fill="#00BBFF",stat="identity")+
theme(legend.position="none", axis.text.x = element_text(angle=90))+
labs(title="平均價格前20地區",x="地區", y="平均價格")
#平均價格前20地區
library(dplyr)
train %>% select(sub_area,price_doc)%>% group_by(sub_area)%>%
summarize(count=n(),price=mean(price_doc))%>%
arrange(desc(price))%>%head(n=20)%>%
ggplot(aes(x=factor(sub_area,levels=sub_area),y=price))+
geom_bar(fill="#00BBFF",stat="identity")+
theme(legend.position="none", axis.text.x = element_text(angle=90))+
labs(title="平均價格前20地區",x="地區", y="平均價格")
#公寓樓層數與log價格
ggplot(train,aes(x=factor(floor),y=log(price_doc),fill=factor(floor)))+
geom_boxplot(alpha=0.4)+
labs(x="公寓樓層數",y="log(價格)",title="樓層數與價格之關係")+
theme(legend.position="none")
ggplot(train,aes(x=factor(floor),y=price_doc,fill=factor(floor)))+
geom_boxplot(alpha=0.4)+
labs(x="公寓樓層數",y="Price",title="樓層數與價格")+
theme(legend.position="none")
#學校有關變數與價格之corrplot
col1 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "white",
"cyan", "#007FFF", "blue","#00007F"))
school_chars <- c('children_preschool', 'preschool_quota', 'preschool_education_centers_raion',
'children_school', 'school_quota', 'school_education_centers_raion',
'school_education_centers_top_20_raion', 'university_top_20_raion',
'additional_education_raion', 'additional_education_km', 'university_km',
'price_doc')
corrplot(cor(train[, school_chars], use='complete.obs'), col = col1(20))
??corrplot
install.packages("corrplot")
corrplot(cor(train[, school_chars], use='complete.obs'), col = col1(20))
#附近設施距離+價格之corrplot
distance_chars <- c('metro_km_avto', 'metro_km_walk', 'kindergarten_km',
'school_km', 'park_km', 'green_zone_km',
'industrial_km', 'stadium_km',
'cemetery_km', 'market_shop_km', 'railroad_station_walk_km',
'price_doc')
corrplot(cor(train[, distance_chars], use='complete.obs'), col = col1(20))
library(corrpot)
library(corrplot)
corrplot(cor(train[, distance_chars], use='complete.obs'), col = col1(20))
#附近設施距離+價格之corrplot
distance_chars <- c('metro_km_avto', 'metro_km_walk', 'kindergarten_km',
'school_km', 'park_km', 'green_zone_km',
'industrial_km', 'stadium_km',
'cemetery_km', 'market_shop_km', 'railroad_station_walk_km',
'price_doc')
library(corrplot)
corrplot(cor(train[, distance_chars], use='complete.obs'), col = col1(20))
p2 <- train %>%
filter(full_sq < 2000) %>%
ggplot(aes(x=full_sq, y=price_doc)) +
geom_point(color='#009FCC', alpha=0.5) +
labs(x='總面積', y='價錢', title='價錢與總面積(不含離群值)')
#價格與公寓空間(有離群和沒有離群)
p1 <- ggplot(aes(x=full_sq, y=price_doc), data=train) +
geom_point(color='#009FCC')+
labs(x='總面積', y='價錢', title='價錢與總面積(含離群值)')
#交易日期與平均價格
train %>%
group_by(timestamp) %>%
summarize(mean_price = mean(price_doc)) %>%
ggplot(aes(x = timestamp, y = mean_price)) +
geom_line(color = '#009FCC') +
ggtitle('交易日期與每日平均價格')
#log(Price) between Investment and OwnerOccupier
ggplot(aes(x=price_doc), data=train) +
geom_density(fill='#009FCC', color='#009FCC') +
facet_grid(~product_type) +
scale_x_continuous(trans='log')+
labs(x='log(價格)', y='', title='不同交易目的與log(價格)之間的關係')
#購買型態的平均價格對比
train %>% group_by(product_type) %>% summarize(mean(price_doc))
#Dist. of price betweem different product_type
ggplot(train,aes(x= price_doc, fill= product_type))+
geom_density(alpha=0.5)+
labs(title="不同交易用途之價格分布",x="價格" , fill="交易用途")
rm(list=ls())
### packages
library(magrittr)
library(dplyr)
library(mice)
library(missForest)
### ------------ Delete columns of categorical ------------ ###
DataDir = list.files()[file.info(list.files())$isdir]
# HousePriceTest = read.csv(list.files(DataDir, pattern = "test", full.names = TRUE))
HousePriceTrain = read.csv(list.files(DataDir, pattern = "train", full.names = TRUE))
### ------------ Delete columns of categorical ------------ ###
data_dir = list.files()[file.info(list.files())$isdir]
# HousePriceTest = read.csv(list.files(DataDir, pattern = "test", full.names = TRUE))
HousePriceTrain = read.csv(list.files(data_dir, pattern = "train", full.names = TRUE))
head(HousePriceTrain)
dim(HousePriceTrain)
data_dir
list.files(data_dir, pattern = "train", full.names = TRUE)
## pick some special variable which we want to keep
id = HousePriceTrain$id
time_trade = as.Date(HousePriceTrain$timestamp)
sub_area = HousePriceTrain$sub_area
## create column names vector
cnames = colnames(HousePriceTrain)
### remove columns with ("count" or "ID" or "top") in column names
Columns_Count_Top_ID = (grepl("count", cnames) | grepl("ID", cnames) | grepl("top", cnames))
HousePrice_NO_Count_Top_ID = HousePriceTrain[, !Columns_Count_Top_ID]
head(HousePrice_NO_Count_Top_ID)
tables = apply(HousePrice_NO_Count_Top_ID, 2, table)
str(HousePrice_NO_Count_Top_ID)
### check NA proportion in variables
## default setting
AbandonRatio = .2
## get columns with NAs
NAinVar = colSums(is.na(HousePrice_NO_Count_Top_ID))
NAinVar[NAinVar > 0]
## remove columns with too many NAs
ColumnRM_NA = which(NAinVar >= nrow(HousePrice_NO_Count_Top_ID)*AbandonRatio)
HousePrice_NAreduce = HousePrice_NO_Count_Top_ID[, -ColumnRM_NA]
head(HousePrice_NAreduce)
dim(HousePrice_NAreduce)
str(HousePrice_NAreduce)
## remove data which datatype is "factor"
ColumnRM_factor = sapply(HousePrice_NAreduce, class)
HousePrice_FACTORreduce = HousePrice_NAreduce[, ColumnRM_factor != "factor"]
str(HousePrice_FACTORreduce)
dim(HousePrice_FACTORreduce)
### ------------ Combine data with similar info ------------ ###
## Combine male and female data as sex ratio
cnames = colnames(HousePrice_FACTORreduce) # get the columns of data after deleting variables
start = which(grepl("full_all", cnames)) # find variable "full_all"
end = which(grepl("X0_13_female", cnames)) # find variable "X0_13_female"
## seperate data into two parts
data_part1 = HousePrice_FACTORreduce[, -(start:end)] # other data
data_part2 = HousePrice_FACTORreduce[, start:end] # population data
# head(data_part2)
# population data to sex ratio data
ratio = NULL
for(i in 1:ncol(data_part2)){
if(i %% 3 == 1){
ratio = cbind(ratio, data_part2[, i])
}else if(i %% 3 == 2){
ratio = cbind(ratio, (data_part2[, i]/data_part2[, i+1]))
}
}
varnames = gsub("_male", "_ratio", colnames(data_part2))
varnames = varnames[!grepl("female", varnames)]
varnames = gsub("ekder", "elder", varnames)
varnames = gsub("male_f", "full_ratio", varnames)
colnames(ratio) = varnames
columns_to_rm = grepl("X0_1", varnames)
sexRatio = data.frame(ratio[, !columns_to_rm])
## Combine sex ratio and data_part1 and name it as HousePrice
HousePrice = cbind(data_part1, sexRatio)
str(HousePrice)
dim(HousePrice)
write.csv(HousePrice, "HousePrice.csv", row.names = FALSE)
## missing ratio
cat("Missing ratio:", mean(is.na(HousePrice)), "\n")
colMeans(is.na(HousePrice))
### packages
library(mice)
library(missForest)
### ------------ Imputation ------------ ###
## try n error: imputation
HousePrice = read.csv("HousePrice.csv")
## take a look at NA data
str(HousePrice)
NAPerCol = colSums(is.na(HousePrice)) # missing per columns
NAPerCol[NAPerCol > 0] # only columns with missing value
## data of NA for "cafe.*1500"
cafe1500 = HousePrice[, grepl("cafe.*1500", colnames(HousePrice))]
cafe1500[!complete.cases(cafe1500), ]
## data of NA for "cafe.*2000"
cafe2000 = HousePrice[, grepl("cafe.*2000", colnames(HousePrice))]
cafe2000[!complete.cases(cafe2000), ]
imputeFUN = function(data, METHOD, MAXIT = 2, M = 10){
# imputation
start = Sys.time()
impute = mice(data, method = METHOD, maxit = MAXIT, m = M)
end = Sys.time()
timePass = end - start
# save imputed data
saveName = paste(METHOD, "_impute.rds", sep = "")
saveRDS(impute, saveName)
# print time spent for imputation
cat("Time Pass:", timePass)
# return all the output as list
out = list(impute = impute, timePass = timePass)
return(out)
}
## separate list into three parts, "n >> p", "n = p", "n << p"
HousePriceList = readRDS(HousePriceList.rds)
## separate list into three parts, "n >> p", "n = p", "n << p"
HousePriceList = readRDS("HousePriceList.rds")
str(HousrPriceList)
length(HousePriceList$id)
length(HousePriceList$id)
(len = length(HousePriceList$id)) # [1] 26073
1e4
(dim = dim(HousePriceList$continuous)) # [1] 26073
HousePriceList$id
rownames(HousePriceList$continuous)
### separate list into three parts, "n >> p", "n = p", "n << p"
HousePriceList = readRDS("HousePriceList.rds")
(dim = dim(HousePriceList$continuous)) # [1] 26073   114
HousePriceList$id
greater_idx = sample(1:dim[1], dim[1] - 1e4)  # keep 10,000 rows for testing
equal_idx = sample(1:dim[1], dim[2])          # pick the number of rows of the same as columns
smaller_idx = sample(1:dim[1], 20)            # pick 20 rows decided by ourselves
# store data in a list
DifferentDimensionHousrPriceList = list(
n>>p = HousePriceList$continuous[greater_idx, ],
n==p = HousePriceList$continuous[equal_idx, ],
n<<p = HousePriceList$continuous[smaller_idx, ],
id = HousePriceList$id,
price = HousePriceList$price,
categorical = HousePriceList$categorical,
all = HousePriceList$continuous
)
# store data in a list
DifferentDimensionHousrPriceList = list(
`n>>p` = HousePriceList$continuous[greater_idx, ],
`n==p` = HousePriceList$continuous[equal_idx, ],
`n<<p` = HousePriceList$continuous[smaller_idx, ],
id = HousePriceList$id,
price = HousePriceList$price,
categorical = HousePriceList$categorical,
all = HousePriceList$continuous
)