```{r data import}
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep = "")  # data source: UCI Machine Learning Repository
breast <- read.table(url, sep = ",", header = FALSE, na.strings = "?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
                   "shapeUniformity", "marginalAdhesion",
                   "singleEpithelialCellSize", "bareNuclei",
                   "blandChromatin", "normalNucleoli", "mitosis", "class")
# Drop the ID column and recode the class labels
df <- breast[-1]
df$class <- factor(df$class, levels = c(2, 4), labels = c("benign", "malignant"))
# 70/30 split into training and validation sets
set.seed(1234)
train <- sample(nrow(df), 0.7 * nrow(df))
df.train <- df[train, ]      # used to fit the models
df.validate <- df[-train, ]  # used to evaluate the fitted models
table(df.train$class)
table(df.validate$class)
```
```{r logistic regression}
fit.logic <- glm(class ~ ., data = df.train, family = binomial())
# Fit a logistic regression with class as the response and the
# remaining variables as predictors
summary(fit.logic)  # inspect the model
prob <- predict(fit.logic, df.validate, type = "response")
# By default predict() returns the log-odds of malignancy;
# type = "response" gives the probability that a tumor is malignant
logic.pred <- factor(prob > 0.5, levels = c(FALSE, TRUE),
                     labels = c("benign", "malignant"))  # classify at the 0.5 cutoff
logic.perf <- table(df.validate$class, logic.pred,
                    dnn = c("Actual", "Predicted"))  # confusion matrix
logic.perf
# A more parsimonious model
logic.fit.reduced <- step(fit.logic)  # stepwise logistic regression
```
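The chunk above selects a reduced model with `step()` but never scores it. A minimal sketch of evaluating it on the validation set the same way as the full model (the chunk name and `prob.reduced` variable are mine, not part of the original):
```{r reduced model evaluation}
# Score the stepwise-reduced model on the validation set
prob.reduced <- predict(logic.fit.reduced, df.validate, type = "response")
logic.pred.reduced <- factor(prob.reduced > 0.5, levels = c(FALSE, TRUE),
                             labels = c("benign", "malignant"))
table(df.validate$class, logic.pred.reduced, dnn = c("Actual", "Predicted"))
```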
```{r decision tree}
library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data = df.train, method = "class",
               parms = list(split = "information"))  # grow the tree
summary(dtree)
dtree$cptable
plotcp(dtree)
# Plot of cross-validated error against the complexity parameter;
# choose the cp of the leftmost tree below the dashed line
dtree.pruned <- prune(dtree, cp = 0.0125)  # prune back unnecessary branches
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,
    fallen.leaves = TRUE, main = "Decision Tree")
# prp() draws the final tree: type = 2 labels each split below the node,
# extra = 104 adds per-class probabilities and the share of observations
# at each node, fallen.leaves = TRUE lines the terminal nodes up at the bottom
dtree.pred <- predict(dtree.pruned, df.validate, type = "class")  # classify validation cases
dtree.perf <- table(df.validate$class, dtree.pred,
                    dnn = c("Actual", "Predicted"))
dtree.perf
```
Columns reported in `dtree$cptable`:
cp: complexity parameter
nsplit: number of splits in the tree
rel error: error of each tree on the training sample
xerror: cross-validated error
xstd: standard error of the cross-validated error
A cp value can also be chosen from this table programmatically, as sketched below.
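Instead of reading cp off the `plotcp()` graph, a minimal sketch (mine, not from the original) of the common one-standard-error rule applied to `dtree$cptable`: keep the smallest tree whose cross-validated error is within one standard error of the minimum.
```{r choosing cp}
# 1-SE rule: smallest tree with xerror within one xstd of the minimum
cpt <- dtree$cptable
min.row <- which.min(cpt[, "xerror"])
threshold <- cpt[min.row, "xerror"] + cpt[min.row, "xstd"]
best.row <- min(which(cpt[, "xerror"] <= threshold))
cp.best <- cpt[best.row, "CP"]
prune(dtree, cp = cp.best)
```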
```{r conditional inference tree}
library(party)
fit.ctree <- ctree(class ~ ., data = df.train)
plot(fit.ctree, main = "Conditional Inference Tree")
ctree.pred <- predict(fit.ctree, df.validate, type = "response")
ctree.perf <- table(df.validate$class, ctree.pred,
                    dnn = c("Actual", "Predicted"))
ctree.perf
```
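`ctree()` can also return class probabilities rather than hard labels. A small sketch of my own, assuming party's documented behavior that `predict(..., type = "prob")` returns a list with one probability vector per observation:
```{r ctree probabilities}
# Probability of each class per validation case; extract the second
# entry (malignant) from each element of the list
ctree.prob <- predict(fit.ctree, df.validate, type = "prob")
head(sapply(ctree.prob, "[", 2))
```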
```{r classic random forest}
library(randomForest)  # can tolerate missing values via na.action
# By default 500 trees are grown, sampling sqrt(number of variables)
# candidate predictors at each node
set.seed(1234)
fit.forest <- randomForest(class ~ ., data = df.train,
                           na.action = na.roughfix, importance = TRUE)
# na.roughfix imputes missing numeric values with the column median and
# missing factor values with the mode; importance = TRUE records
# variable importance
fit.forest  # the fitted forest (training)
importance(fit.forest, type = 2)
# type = 2 reports relative importance as the total decrease in node
# impurity (Gini index) from splitting on the variable, averaged over
# all trees
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
                     dnn = c("Actual", "Predicted"))
forest.perf
```
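The importance table can also be plotted directly; a one-line sketch using randomForest's own `varImpPlot()`:
```{r forest importance plot}
# Dotcharts of variable importance (accuracy- and Gini-based measures)
varImpPlot(fit.forest, main = "Variable Importance")
```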
```{r conditional inference random forest}
library(party)
set.seed(1234)
fit.cforest <- cforest(class ~ ., data = df.train)
fit.cforest
# predict() can fail when newdata contains missing values, so drop
# incomplete cases before scoring
cforest.pred <- predict(fit.cforest, newdata = na.omit(df.validate),
                        type = "response")
cforest.perf <- table(na.omit(df.validate)$class, cforest.pred,
                      dnn = c("Actual", "Predicted"))
cforest.perf
```
```{r SVM}
library(e1071)  # svm() cannot handle missing values
set.seed(1234)
fit.svm <- svm(class ~ ., data = df.train)
fit.svm
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf
```
```{r SVM with RBF kernel}
# gamma: width of the RBF kernel, controlling how far each training
# observation's influence reaches; defaults to 1/(number of predictors)
# cost: penalty for margin violations; too large overfits, too small
# underfits; defaults to 1
set.seed(1234)
tuned <- tune.svm(class ~ ., data = df.train,
                  gamma = 10^(-6:1), cost = 10^(-10:10))  # grid search
tuned  # reports the best parameter combination
fit.svm <- svm(class ~ ., data = df.train, gamma = 0.01, cost = 1)
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf  # slightly fewer errors than the default settings
```
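Rather than retyping the winning parameters into a fresh `svm()` call, `tune.svm()` already stores the model refit with the best combination; a short sketch of using that object directly (`best.svm` is my own name):
```{r best tuned model}
# tune.svm() keeps the refitted best model in $best.model
best.svm <- tuned$best.model
svm.pred.best <- predict(best.svm, na.omit(df.validate))
table(na.omit(df.validate)$class, svm.pred.best,
      dnn = c("Actual", "Predicted"))
```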
```{r accuracy}
performance <- function(table, n = 2) {
  if (!all(dim(table) == c(2, 2)))
    stop("Must be a 2x2 table")
  tn <- table[1, 1]  # negatives predicted correctly (true negatives)
  fp <- table[1, 2]  # negatives predicted incorrectly (false positives)
  fn <- table[2, 1]  # positives predicted incorrectly (false negatives)
  tp <- table[2, 2]  # positives predicted correctly (true positives)
  sensitivity <- tp / (tp + fn)
  specificity <- tn / (tn + fp)
  ppp <- tp / (tp + fp)  # positive predictive value
  npp <- tn / (tn + fn)  # negative predictive value
  hitrate <- (tp + tn) / (tp + tn + fp + fn)  # accuracy
  result <- paste("sensitivity = ", round(sensitivity, n),  # n = digits to keep
                  "\nspecificity = ", round(specificity, n),
                  "\nnegative predictive value = ", round(npp, n),
                  "\npositive predictive value = ", round(ppp, n),
                  "\naccuracy = ", round(hitrate, n), "\n\n", sep = " ")
  cat(result)
}
performance(logic.perf)
performance(dtree.perf)
performance(ctree.perf)
performance(forest.perf)
performance(svm.perf)
```
```{r}
library(rattle)  # graphical user interface for data mining in R
library(RGtk2)   # GTK bindings required by the rattle GUI
rattle()         # launch the interface
```
```{r}
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "pima-indians-diabetes/pima-indians-diabetes.data"
url <- paste(loc, ds, sep = "")
diabetes <- read.table(url, sep = ",", header = FALSE)
names(diabetes) <- c("npregnant", "plasma", "bp", "triceps", "insulin",
                     "bmi", "pedigree", "age", "class")
diabetes$class <- factor(diabetes$class, levels = c(0, 1),
                         labels = c("normal", "diabetic"))
```
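The diabetes data is imported but never modeled above. A minimal sketch of my own continuation, mirroring the breast-cancer workflow with a 70/30 split and a random forest (all object names here are mine):
```{r diabetes random forest}
library(randomForest)
# Same split-and-score workflow as for the breast-cancer data
set.seed(1234)
d.idx <- sample(nrow(diabetes), 0.7 * nrow(diabetes))
d.train <- diabetes[d.idx, ]
d.validate <- diabetes[-d.idx, ]
fit.d.forest <- randomForest(class ~ ., data = d.train)
d.perf <- table(d.validate$class, predict(fit.d.forest, d.validate),
                dnn = c("Actual", "Predicted"))
performance(d.perf)  # reuses the helper defined in the accuracy chunk
```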