-
Notifications
You must be signed in to change notification settings - Fork 10
/
review.r
261 lines (199 loc) · 11.4 KB
/
review.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# ---- Dependencies ----
# library() stops right away when a package is missing; require() would only
# return FALSE and let the script fail later with an unrelated error.
library(plyr)
library(randomForest)
#library(geosphere)

# ---- Raw inputs ----
# Every relative path below is resolved against this directory.
setwd('~/git/event')
# util.r is not shown here. output() and mapk() are used further down but are
# never defined in this file, so they presumably come from it.
source('util.r')
attend <- read.csv('data/event_attendees.csv')
event <- read.csv('data/events.csv')
# Read the user table without converting strings to factors.
user <- read.csv('data/users.csv', stringsAsFactors = FALSE)
train <- read.csv('train.csv')
test <- read.csv('test.csv')
friends <- read.csv('data/user_friends.csv')
#train <- train[!duplicated(train[, c('user', 'event')]), ]

# ---- Training table ----
# Join train and user on the first column of each table.
db <- merge(train, user, by.x = 1, by.y = 1)
# populiarity.r is not shown. It is expected to create attend_yes, attend_no,
# attend_maybe, attend_invited and yes, all of which are consumed just below.
# NOTE(review): cbind() needs those vectors to be row-aligned with db as it is
# at this point - confirm against populiarity.r.
source('populiarity.r')
db <- cbind(
  db,
  friends_yes = attend_yes,
  friends_no = attend_no,
  friends_maybe = attend_maybe,
  friends = attend_invited
)
# Attach `yes` to the event table (first column of each is the key) and put
# the populiarity column on a log(x + 1) scale.
event_merge <- merge(event, yes, by.x = 1, by.y = 1)
event_merge$populiarity <- log(event_merge$populiarity + 1)
# Join on db's second column against the first column of event_merge.
db <- merge(db, event_merge, by.x = 2, by.y = 1)
# ---- Feature preparation (training table) ----

# Birth years that do not start with four digits are overwritten with the
# median of the well-formed ones (author's note: 1977); the column is then
# converted to numeric.
four_digit_rows <- grep('^\\d{4}', db$birthyear)
db$birthyear[-four_digit_rows] <- median(as.numeric(db$birthyear[four_digit_rows]))
db$birthyear <- as.numeric(as.character(db$birthyear))

# Parse the event start and the timestamp column as UTC date-times.
db$start_time <- as.POSIXct(
  strptime(as.character(db$start_time), format = '%Y-%m-%dT%H:%M:%S'),
  tz = 'UTC'
)
db$timestamp <- as.POSIXct(
  strptime(as.character(db$timestamp), format = '%Y-%m-%d %H:%M:%S'),
  tz = 'UTC'
)

# Missing time-zone offsets count as 0.
db$timezone[is.na(db$timezone)] <- 0

# Hours from the timestamp to the event start.
# (The author left a "- db$timezone / 60" correction disabled.)
db$time_diff <- as.numeric(difftime(db$start_time, db$timestamp, units = 'hours'))

# Offset divided by 60 and rounded, then binned into two-unit intervals
# between -14 and 14 (presumably minutes -> hours; confirm the input unit).
db$timezone <- cut(
  round(db$timezone / 60),
  breaks = seq(from = -14, to = 14, by = 2)
)

# Keep only the part of the locale string before the first underscore.
db$locale <- factor(
  sapply(as.character(db$locale), function(code) strsplit(code, '_')[[1]][1])
)
db$invited <- factor(db$invited)

# Time elapsed between 2000-01-01 and joinedAt; difftime() chooses the unit
# automatically, exactly as subtracting the two date-times does.
db$joinedAt <- as.numeric(
  difftime(as.POSIXct(as.character(db$joinedAt)), as.POSIXct('2000-01-01'))
)
db$gender <- factor(db$gender)

# Abbreviated weekday of the timestamp and hour of the event start, both as
# factors.
db$weekdays <- factor(format(db$timestamp, '%a'))
db$start_hour <- factor(format(db$start_time, '%H'))
# ---- Location clean-up and location match score (training table) ----
# tmp is 1 for rows whose last space-separated token of location contains a
# digit, and 0 otherwise.
tmp=sapply((db$location),function(x){
addr=strsplit(x," ")
ifelse(length(grep("[[:digit:]]",(addr[[1]][length(addr[[1]])])))>0,1,0)
})
# Replacement text for the flagged rows.
# NOTE(review): y is already the token vector, so y[[1]] is the first token
# (length 1) and 1:(length(y[[1]])-1) evaluates to 1:0. As written, the result
# is therefore only the FIRST token of the location, not "everything except
# the trailing token". If dropping the trailing token was intended, this needs
# changing - together with the identical code applied to db_test further
# down, so that train and test features stay consistent.
addr=as.character(sapply(db$location[which(as.numeric(tmp)==1)],function(x){
y=strsplit(x," ")[[1]]
y[[1]][1:(length(y[[1]])-1)]
}))
db$location[which(as.numeric(tmp)==1)]=addr
# location_mat is the sum of three per-row terms, one each for country, city
# and state. For each term:
#   * 0 when the field has at most one character;
#   * otherwise the field is used as a regular expression against location:
#     grep() gives 1 on a match and integer(0) on no match, and ifelse() turns
#     the latter into NA.
# country and city are matched case-insensitively, state case-sensitively.
# Because NA propagates through "+", one non-matching field makes the whole
# sum NA.
db$location_mat=apply(db[,c('country','location')],1,function(x)ifelse(nchar(as.character(x[1]))>1,grep(as.character(x[1]),as.character(x[2]),ignore.case=TRUE),0))+
apply(db[,c('city','location')],1,function(x)ifelse(nchar(as.character(x[1]))>1,grep(as.character(x[1]),as.character(x[2]),ignore.case=TRUE),0))+
apply(db[,c('state','location')],1,function(x)ifelse(nchar(as.character(x[1]))>1,grep(as.character(x[1]),as.character(x[2]),ignore.case=FALSE),0))
# Rows where any term was NA get a score of 0.
# NOTE(review): this means a partial match (e.g. country found, city not)
# scores 0 rather than 1 - confirm that is intended.
db$location_mat[is.na(db$location_mat)]=0
# ---- Hold-out split by user ----
# Draw 1338 distinct users with a fixed seed (author's note next to the user
# list: 2015); train_nr ends up holding the row numbers of db that belong to
# the drawn users.
unique_users <- unique(db$user)
set.seed(333)
sampled_user_pos <- sample.int(length(unique_users), 1338)
train_nr <- unique_users[sampled_user_pos]
train_nr <- which(db$user %in% train_nr)

# ---- Feature selection ----
# Candidate columns: the two label columns, a fixed list of predictors, every
# column whose name contains "c_" and every column whose name contains
# "location". Candidates the author had disabled here: distance, frequency,
# weekdays, start_hour, hour, user_id, month.
candidate_names <- c(
  'interested', 'not_interested',
  'invited', 'birthyear', 'gender',
  'timezone',
  'locale',
  'populiarity',
  'time_diff',
  'friends',
  'friends_yes', 'friends_no', 'friends_maybe', 'joinedAt'
)
candidate_pos <- c(
  match(candidate_names, colnames(db)),
  grep('c_', colnames(db)),
  grep('location', colnames(db))
)
interested <- db[, candidate_pos]

# Forest on all rows of db; the response is (interested - not_interested)/2 + .5
# turned into a factor.
set.seed(333)
features <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = interested,
  importance = TRUE,
  ntree = 150
)

# Keep the variables that are in the top z by importance column 5 and also in
# the top z by importance column 4, in column-5 order.
# (Which measure each column holds depends on the number of response classes;
# with three classes these would be MeanDecreaseGini and MeanDecreaseAccuracy -
# confirm.)
z <- 30
rf_importance <- randomForest::importance(features)
top_by_col5 <- rownames(
  rf_importance[order(rf_importance[, 5], decreasing = TRUE), ]
)[1:z]
top_by_col4 <- rownames(
  rf_importance[order(rf_importance[, 4], decreasing = TRUE), ]
)[1:z]
cols <- top_by_col5[which(top_by_col5 %in% top_by_col4)]
# First 19 of those plus the two time factors.
cols <- c(cols[1:19], 'weekdays', 'start_hour')
# Hand-picked predictor set (hardcoded after cherry picking). This assignment
# replaces whatever `cols` held before; the order of the names is unchanged.
cols <- c(
  "time_diff",
  "friends",
  "populiarity",
  "joinedAt",
  "birthyear",
  "c_other",
  "friends_yes",
  "timezone",
  "friends_maybe",
  "c_6",
  "friends_no",
  "c_1",
  "locale",
  "c_52",
  "c_3",
  "c_21",
  "c_2",
  "c_4",
  "weekdays",
  "start_hour",
  "location_mat"
)
# Entries the author had tried and disabled inside the list above:
#   "country", "distance",
#   "country.1", "countryIndonesia", "countryCambodia",
#   "locationSpain", "locationYogyakarta", "location.1", "locationCalifornia",
#   "locationIndonesia", "locationOntario",
#   and further per-country indicator columns (countryUnited.States,
#   countryDominican.Republic, countryCanada, countryMauritius,
#   countryUnited.Kingdom, countryItaly, countryAustralia, countryGreece,
#   countryHong.Kong, countryIndia, countryUganda, countryFinland,
#   countrySaint.Vincent.and.the.Grenadines, countrySingapore, countryMexico,
#   countrySpain, countryFrance, countryPakistan, countrySwitzerland).
# Earlier variant of the whole list, kept for reference:
#cols=c("time_diff","friends", "populiarity", "joinedAt", "distance", "birthyear","c_other", "friends_yes","timezone", "friends_maybe" ,"c_6", "friends_no","locale","c_1","c_2","c_52","c_3","c_5","c_4", "c_7", "c_9", "c_10","c_34")
# ---- Hold-out evaluation ----
# Fit on the rows in train_nr, using the two label columns plus `cols`.
holdout_cols <- c('interested', 'not_interested', cols)
interested <- db[train_nr, match(holdout_cols, colnames(db))]
set.seed(333)
rez <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = interested,
  importance = TRUE,
  ntree = 150,
  nodesize = 4
)

# Remaining rows: event, user, both labels, then the predictors.
pred_data <- db[-train_nr, match(c('event', 'user', holdout_cols), colnames(db))]
# Class probabilities; column 4 (not_interested) is left out of newdata.
pred <- predict(rez, pred_data[, -4], type = 'prob')

# Reference frame: event, user, interested, and in column 4 the same
# (interested - not_interested)/2 + .5 score the model was trained on.
benchmark_data <- pred_data[, 1:4]
benchmark_data[, 4] <- (benchmark_data[, 3] - benchmark_data[, 4]) / 2 + .5
# Prediction frame: event, user, interested, probability of the third class.
pred_data <- cbind(pred_data[, 1:3], pred[, 3])

# One row per user. output() is not defined in this file (presumably util.r);
# it is called with the per-user rows and a threshold.
benchmark_rez <- ddply(benchmark_data, .(user), function(user_rows) {
  data.frame(event = output(user_rows, .99))
})
pred_rez <- ddply(pred_data, .(user), function(user_rows) {
  data.frame(event = output(user_rows, .0001))
})

# Strip trailing whitespace, split each per-user string on single spaces and
# score with mapk() (also not defined in this file) at 200.
benchmark_lists <- strsplit(
  as.character(sub("[[:space:]]+$", '', benchmark_rez[, 2])), ' '
)
predicted_lists <- strsplit(
  as.character(sub("[[:space:]]+$", '', pred_rez[, 2])), ' '
)
print(mapk(200, benchmark_lists, predicted_lists))
# Scores recorded by the author:
#final 0.6989714
#public 0.7284886
# ---- Public leaderboard solution ----
# Re-shape the solution file into the four-column layout passed to output()
# elsewhere in this script: event (file column 2), user (file column 1), and
# two constant columns holding 0 and 1.
public_solution <- read.csv('public_leaderboard_solution.csv')
public <- data.frame(
  event = public_solution[, 2],
  user = public_solution[, 1],
  0,
  1
)
# One row per user, built by output() (not defined in this file) with
# threshold .99.
public_rez <- ddply(public, .(user), function(user_rows) {
  data.frame(event = output(user_rows, .99))
})
###############FINAL#############
ids <- read.csv('event_popularity_benchmark_private_test_only.csv')

# Test table: join users on the first column, then events on the second
# column. all.x = TRUE keeps test rows that have no match in event_merge.
db_test <- test
db_test <- merge(db_test, user, by.x = 1, by.y = 1)
db_test <- merge(db_test, event_merge, by.x = 2, by.y = 1, all.x = TRUE)

#####data preparation######

# Share of a user's friends that appear in one attendance list of an event.
#
# pairs  : two-column frame; column 1 is matched against attend$event and
#          column 2 against friends$user.
# status : name of the attend column holding the space-separated ids to look
#          in ('yes', 'no', 'maybe' or 'invited').
#
# Returns one value per row of `pairs`: (friends found in the list) divided
# by (number of friends). This replaces four copies of the same closure that
# differed only in the attend column; the computation itself is unchanged.
# NOTE(review): an empty friend list gives 0/0 = NaN. That is left as is so
# the values stay comparable with the training-side features, which are built
# outside this file - confirm how those handle the same case.
friend_share_final <- function(pairs, status) {
  apply(pairs, 1, function(x) {
    fnd <- strsplit(
      as.character(friends[which(friends$user == x[2]), ]$friends), ' '
    )[[1]]
    event_rows <- attend[which(attend$event %in% x[1]), ]
    # exact = FALSE keeps the partial name matching that `$` performed before.
    listed <- strsplit(
      as.character(event_rows[[status, exact = FALSE]]), ' '
    )[[1]]
    length(which(fnd %in% listed)) / length(fnd)
  })
}

attend_yes_final <- friend_share_final(db_test[, 1:2], 'yes')
attend_no_final <- friend_share_final(db_test[, 1:2], 'no')
attend_maybe_final <- friend_share_final(db_test[, 1:2], 'maybe')
attend_invited_final <- friend_share_final(db_test[, 1:2], 'invited')

db_test <- cbind(
  db_test,
  friends = attend_invited_final,
  friends_yes = attend_yes_final,
  friends_no = attend_no_final,
  friends_maybe = attend_maybe_final
)
# ---- Feature preparation (test table) ----
#db_test=db_test[grep('^\\d{4}',db_test$birthyear),]

# Birth years that do not start with four digits get the median birth year of
# the training table (author's note: 1977). A logical mask is used instead of
# x[-grep(...)]: with a negative index nothing is selected when grep() finds
# no match at all, so the imputation would be silently skipped in that case.
malformed_birthyear <- !grepl('^\\d{4}', db_test$birthyear)
db_test$birthyear[malformed_birthyear] <- median(db$birthyear)
db_test$birthyear <- as.numeric(as.character(db_test$birthyear))

# Parse the event start and the timestamp column as UTC date-times.
db_test$start_time <- as.POSIXct(
  strptime(as.character(db_test$start_time), format = '%Y-%m-%dT%H:%M:%S'),
  tz = 'UTC'
)
db_test$timestamp <- as.POSIXct(
  strptime(as.character(db_test$timestamp), format = '%Y-%m-%d %H:%M:%S'),
  tz = 'UTC'
)
# Hours from the timestamp to the event start.
db_test$time_diff <- as.numeric(
  difftime(db_test$start_time, db_test$timestamp, units = 'hours')
)
db_test$invited <- factor(db_test$invited)

# Missing offsets count as 0. The raw offset is kept for seeing_time below
# before the column is binned the same way as in the training table.
db_test$timezone[is.na(db_test$timezone)] <- 0
test_user_time_diff <- db_test$timezone
db_test$timezone <- cut(
  round(db_test$timezone / 60),
  breaks = seq(from = -14, to = 14, by = 2)
)

# Locale: keep the part before the first underscore and map every value that
# is not a level of the training locale factor to 'en'. The factor is built
# over training + test values together, and only the test part is kept.
tmp <- sapply(as.character(db_test$locale), function(x) strsplit(x, "_")[[1]][1])
tmp[!(tmp %in% levels(db$locale))] <- 'en'
temp <- as.factor(c(as.character(db$locale), tmp))
db_test$locale <- tail(temp, length(tmp))

# Time elapsed between 2000-01-01 and joinedAt (unit chosen automatically by
# the date-time subtraction).
db_test$joinedAt <- as.numeric(
  as.POSIXct(as.character(db_test$joinedAt)) - as.POSIXct('2000-01-01')
)
db_test$gender <- factor(db_test$gender)
db_test$weekdays <- factor(format(db_test$timestamp, '%a'))
db_test$start_hour <- factor(format(db_test$start_time, '%H'))

# Hour of the timestamp after adding the raw offset * 60 seconds, binned into
# three 8-hour intervals closed on the left.
db_test$seeing_time <- cut(
  as.numeric(format(db_test$timestamp + test_user_time_diff * 60, '%H')),
  breaks = seq(from = 0, to = 24, by = 8),
  right = FALSE
)
# ---- Location clean-up and location match score (test table) ----
# tmp is 1 for rows whose last space-separated token of location contains a
# digit, and 0 otherwise.
tmp=sapply((db_test$location),function(x){
addr=strsplit(x," ")
ifelse(length(grep("[[:digit:]]",(addr[[1]][length(addr[[1]])])))>0,1,0)
})
# Replacement text for the flagged rows.
# NOTE(review): y is already the token vector, so y[[1]] is the first token
# (length 1) and 1:(length(y[[1]])-1) evaluates to 1:0. As written, the result
# is therefore only the FIRST token of the location, not "everything except
# the trailing token". The training table (db) is processed with the same
# code earlier in this file; any change has to be made in both places so the
# features stay consistent.
addr=as.character(sapply(db_test$location[which(as.numeric(tmp)==1)],function(x){
y=strsplit(x," ")[[1]]
y[[1]][1:(length(y[[1]])-1)]
}))
db_test$location[which(as.numeric(tmp)==1)]=addr
# location_mat is the sum of three per-row terms, one each for country, city
# and state. For each term:
#   * 0 when the field has at most one character;
#   * otherwise the field is used as a regular expression against location:
#     grep() gives 1 on a match and integer(0) on no match, and ifelse() turns
#     the latter into NA.
# country and city are matched case-insensitively, state case-sensitively.
# Because NA propagates through "+", one non-matching field makes the whole
# sum NA.
db_test$location_mat=(apply(db_test[,c('country','location')],1,function(x)ifelse(nchar(as.character(x[1]))>1,grep(as.character(x[1]),as.character(x[2]),ignore.case=TRUE),0))+
apply(db_test[,c('city','location')],1,function(x)ifelse(nchar(as.character(x[1]))>1,grep(as.character(x[1]),as.character(x[2]),ignore.case=TRUE),0))+
apply(db_test[,c('state','location')],1,function(x)ifelse(nchar(as.character(x[1]))>1,grep(as.character(x[1]),as.character(x[2]),ignore.case=FALSE),0)))
# Rows where any term was NA get a score of 0.
db_test$location_mat[is.na(db_test$location_mat)]=0
########predict final#########

# Extra training rows taken from the test table, all labelled
# interested = 1 / not_interested = 0.
# NOTE(review): the mask tests event and user membership in `public`
# independently, so a row is kept whenever its event and its user each occur
# somewhere in `public`, not necessarily on the same row - confirm that this
# is intended.
in_public <- db_test$event %in% public$event & db_test$user %in% public$user
tmp <- data.frame(interested = 1, not_interested = 0, db_test[in_public, ])
tmp <- tmp[, match(c('interested', 'not_interested', cols), colnames(tmp))]

# Training frame for the final model: every row of db plus the rows above.
final_model <- rbind(
  db[, match(c('interested', 'not_interested', cols), colnames(db))],
  tmp
)

# Three forests that differ only in their seed, merged into one ensemble.
# From here on `final_model` is the combined forest, not the data frame.
set.seed(333)
final_model3 <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = final_model, importance = TRUE, nodesize = 4
)
set.seed(33)
final_model1 <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = final_model, importance = TRUE, nodesize = 4
)
set.seed(3)
final_model2 <- randomForest(
  factor((interested - not_interested) / 2 + .5) ~ .,
  data = final_model, importance = TRUE, nodesize = 4
)
final_model <- combine(final_model3, final_model1, final_model2)

# Score every test row; keep the first three columns of db_test together with
# the probability of the third class.
test_selected <- db_test[, match(cols, colnames(db_test))]
pred_test <- predict(final_model, test_selected, type = 'prob')
pred_data <- cbind(db_test[, 1:3], pred_test[, 3])
# One row per user, built by output() (not defined in this file) with
# threshold .0001.
pred_data <- ddply(pred_data, .(user), function(user_rows) {
  data.frame(event = output(user_rows, .0001))
})

# Compare with the public solution: keep every user of public_rez, strip
# trailing whitespace, split on single spaces and score with mapk() at 200.
tmp <- merge(pred_data, public_rez, by.x = 1, by.y = 1, all.y = TRUE)
public_lists <- strsplit(
  as.character(sub("[[:space:]]+$", '', tmp$event.y)), ' '
)
predicted_lists <- strsplit(
  as.character(sub("[[:space:]]+$", '', tmp$event.x)), ' '
)
print(mapk(200, public_lists, predicted_lists))
# Score recorded by the author:
#0.6971235

# Submission: only users listed in the first column of `ids`, columns renamed,
# trailing whitespace removed from the event string.
pred_data <- pred_data[pred_data[, 1] %in% ids[, 1], ]
colnames(pred_data) <- c('User', 'Events')
pred_data$Events <- gsub("[[:space:]]*$", "", pred_data$Events)
# NOTE(review): write.csv2() writes ';' as the field separator - confirm that
# this is the format the consumer of result.csv expects.
write.csv2(pred_data, 'result.csv', row.names = FALSE, quote = FALSE)