-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtwo.r
81 lines (69 loc) · 4.19 KB
/
two.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Location: distance between the two coordinate pairs stored on each row
# (user_long/user_lat versus lng/lat).
# `tmp` holds the row numbers where all four coordinates are present.
tmp <- which(
  !is.na(db$lat) & !is.na(db$lng) &
    !is.na(db$user_lat) & !is.na(db$user_long)
)
user_xy <- cbind(db$user_long, db$user_lat)
other_xy <- cbind(db$lng, db$lat)
temp <- distVincentyEllipsoid(p1 = user_xy[tmp, ], p2 = other_xy[tmp, ])
# Create the column with one fill value, then overwrite the measurable rows.
# NOTE(review): median() is called without na.rm, so the fill value is NA
# whenever `temp` contains an NA. Later code selects rows with
# is.na(distance), which only finds rows if the fill value is NA -- confirm
# whether NA or a median imputation is the intended fill.
db$distance <- median(temp)
db$distance[tmp] <- temp
# Split by user rather than by row: all rows of a sampled user go to training.
unique_users <- unique(db$user) # author's note: 2015
set.seed(333)
# Draw 1338 distinct users; the seed above makes the draw reproducible.
train_users <- unique_users[sample(seq_along(unique_users), 1338)]
# `train_nr` ends up as the row numbers of `db` that belong to those users.
train_nr <- which(db$user %in% train_users)
# Model for training rows that have a distance ("users have coordinates").
# Leftover marker of a commented-out parameter sweep:
# for (z in seq(32, 40, by = 2)) {

# Candidate columns: the two response parts, the hand-picked predictors and
# every column whose name contains "c_".
candidate_cols_coord <- c(
  match(
    c(
      "interested", "not_interested", "distance",
      "invited", "birthyear", "gender",
      "timezone",
      "locale",
      "populiarity",
      "time_diff",
      "friends",
      "friends_yes", "friends_no", "friends_maybe", "joinedAt"
    ),
    colnames(db)
  ),
  grep("c_", colnames(db))
)

# Decide the training rows once, from db itself. The original re-filtered on
# interested_coord$distance after the feature selection, which silently drops
# every row if 'distance' is not among the selected features.
coord_train_rows <- train_nr[!is.na(db$distance[train_nr])]

# Stage 1: forest on all candidates, used only to rank the predictors.
interested_coord <- db[coord_train_rows, candidate_cols_coord]
set.seed(333)
features_coord <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = interested_coord, importance = TRUE, ntree = 150
) # , nodesize = 1)

# Keep the predictors that are in the top 30 of importance column 5 and also
# in the top 30 of importance column 4, in column-5 order.
# NOTE(review): with a three-level response these columns are presumably the
# overall accuracy (4) and Gini (5) measures -- confirm against
# randomForest::importance().
# head() is used instead of [1:30] so fewer than 30 predictors do not
# introduce NA names.
imp_coord <- randomForest::importance(features_coord)
top_col5_coord <- head(
  rownames(imp_coord)[order(imp_coord[, 5], decreasing = TRUE)], 30
)
top_col4_coord <- head(
  rownames(imp_coord)[order(imp_coord[, 4], decreasing = TRUE)], 30
)
cols_coord <- top_col5_coord[top_col5_coord %in% top_col4_coord]

# Stage 2: refit on the selected predictors only, same rows and same seed.
interested_coord <- db[
  coord_train_rows,
  match(c("interested", "not_interested", cols_coord), colnames(db))
]
set.seed(333)
rez_coord <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = interested_coord, importance = TRUE, ntree = 150
) # , nodesize = 1)
# Model for training rows whose distance is NA. 'distance' is therefore not a
# predictor here and is left out of the candidate columns.
candidate_cols <- c(
  match(
    c(
      "interested", "not_interested",
      "invited", "birthyear", "gender",
      "timezone",
      "locale",
      "populiarity",
      "time_diff",
      "friends",
      "friends_yes", "friends_no", "friends_maybe", "joinedAt"
    ),
    colnames(db)
  ),
  grep("c_", colnames(db))
)
nocoord_train_rows <- train_nr[is.na(db$distance[train_nr])]

# Stage 1: forest on all candidates, used only to rank the predictors.
interested <- db[nocoord_train_rows, candidate_cols]
set.seed(333)
features <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = interested, importance = TRUE, ntree = 150
) # , nodesize = 1)

# Keep the predictors that are in the top 30 of importance column 5 and also
# in the top 30 of importance column 4, in column-5 order.
# NOTE(review): with a three-level response these columns are presumably the
# overall accuracy (4) and Gini (5) measures -- confirm against
# randomForest::importance().
# head() is used instead of [1:30] so fewer than 30 predictors do not
# introduce NA names.
imp <- randomForest::importance(features)
top_col5 <- head(rownames(imp)[order(imp[, 5], decreasing = TRUE)], 30)
top_col4 <- head(rownames(imp)[order(imp[, 4], decreasing = TRUE)], 30)
cols <- top_col5[top_col5 %in% top_col4]

# Stage 2: refit on the selected predictors only, same rows and same seed.
interested <- interested[
  ,
  match(c("interested", "not_interested", cols), colnames(interested))
]
set.seed(333)
rez <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = interested, importance = TRUE, ntree = 150
) # , nodesize = 1)
# Score the hold-out rows (every row of db that is not in train_nr).
# setdiff() replaces db[-train_nr, ]: a negative index built from an empty
# vector would select no rows at all.
holdout_rows <- setdiff(seq_len(nrow(db)), train_nr)

# Route each hold-out row by db$distance itself. The original looked the
# column up in a frame restricted to cols_coord, which has no 'distance'
# column (and so yields no rows) when 'distance' was not selected.
holdout_has_dist <- !is.na(db$distance[holdout_rows])

# Rows with a distance are scored by the coordinate model.
coord_newdata <- db[
  holdout_rows[holdout_has_dist],
  match(c("event", "user", "interested", cols_coord), colnames(db))
]
pred <- predict(rez_coord, coord_newdata, type = "prob")

# Rows without a distance are scored by the other model.
nocoord_newdata <- db[
  holdout_rows[!holdout_has_dist],
  match(c("event", "user", "interested", "distance", cols), colnames(db))
]
pred <- rbind(predict(rez, nocoord_newdata, type = "prob"), pred)

# Put the stacked predictions back into row-name order so they line up with
# the hold-out rows below (relies on numeric row names).
pred <- pred[order(as.numeric(rownames(pred))), ]

# Ground truth: event, user and the response recoded the same way as in the
# model formulas.
pred_data <- db[
  holdout_rows,
  match(c("event", "user", "interested", "not_interested"), colnames(db))
]
benchmark_data <- pred_data[, 1:4]
benchmark_data[, 4] <- (benchmark_data[, 3] - benchmark_data[, 4]) / 2 + .5

# Predictions: same three leading columns plus the third probability column
# (the last level of the response factor).
pred_data <- cbind(pred_data[, 1:3], pred[, 3])
# Build one `event` entry per user from the ground-truth table.
# NOTE(review): output() is defined elsewhere; the second argument (.99 here)
# looks like a cut-off -- confirm against its definition.
benchmark_rez <- ddply(benchmark_data, .(user), function(user_rows) {
  data.frame(event = output(user_rows, .99))
})

# Same per-user step on the predictions, with cut-off .29.
# A commented-out sweep over seq(.25, .35, by = .01) used to wrap this call.
pred_rez <- ddply(pred_data, .(user), function(user_rows) {
  data.frame(event = output(user_rows, .29))
})

# Split both result columns on single spaces and print the score from mapk().
# (A commented-out print(z) belonged to the disabled outer sweep.)
actual_events <- strsplit(as.character(benchmark_rez[, 2]), " ")
predicted_events <- strsplit(as.character(pred_rez[, 2]), " ")
print(mapk(200, actual_events, predicted_events))