#!/usr/bin/env python
# This is a tool for testing and administrative tasks. It is designed to
# be %run in ipython. If you import it from another module, you're doing
# something wrong.
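#
# A typical session (a hypothetical example; the functions are defined below,
# but which databases and queues they touch depends on your settings):
#   $ ipython
#   In [1]: %run admin.py
#   In [2]: export_json()            # dump every document as JSON lines on stdout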
import json
import itertools
import time
import logging
import heapq
import getopt
import sys
import cjson
from datetime import datetime as dt
from couchdbkit import ResourceNotFound, BulkSaveError
import restkit.errors
from couchdbkit.loaders import FileSystemDocsLoader
from settings import settings
from scoredict import Scores, BUCKETS, log_score, DONE
import lookup
import twitter
from models import *
from utils import grouper, couch, mongo
def design_sync(prefix):
    "sync the design documents in <prefix>_design into the current database"
    loader = FileSystemDocsLoader(prefix+'_design')
    loader.sync(Model.database, verbose=True)
def stop_lookup():
    "tell the lookup master process to halt"
    # `stalk` is assumed to be a beanstalkd client (e.g. a beanstalkc
    # Connection) created in the interactive session; it is not defined here.
    stalk.use(settings.region+"_lookup_done")
    stalk.put('halt',0)
def import_json():
for g in grouper(1000,sys.stdin):
try:
Model.database.bulk_save([json.loads(l) for l in g if l])
except BulkSaveError as err:
if any(d['error']!='conflict' for d in err.errors):
raise
else:
logging.warn("conflicts for %r",[d['id'] for d in err.errors])
def filter_usians(path=None):
    "print the tweets whose coordinates fall in a rough continental-US bounding box"
    infile = open(path) if path else sys.stdin
    for line in infile:
        tweet = cjson.decode(line)
        if not tweet.get('coordinates'):
            continue
        lng,lat = tweet['coordinates']['coordinates']
        if 24<lat<50 and -126<lng<-66:
            print line,
def import_old_json():
for g in grouper(1000,sys.stdin):
docs = [json.loads(l) for l in g if l]
for d in docs:
del d['doc_type']
            # old-style ids carry a one-letter type prefix (e.g. 'U' for
            # users); strip it from id fields and retweet references
            for k,v in d.iteritems():
                if k[-2:]=='id' or k in ('rtt','rtu'):
                    d[k]=v[1:]
for field in ['ats','fols','frs']:
if field in d and isinstance(d[field],list):
d[field] = [u[1:] for u in d[field]]
Model.database.bulk_save(docs)
def export_json(start=None,end=None):
for d in Model.database.paged_view('_all_docs',include_docs=True,startkey=start,endkey=end):
if d['id'][0]!='_':
del d['doc']['_rev']
print json.dumps(d['doc'])
def export_mongo():
for d in Model.database.paged_view('_all_docs',include_docs=True):
if d['id'][0]!='_':
d = d['doc']
del d['_rev']
for k,v in d.iteritems():
if k[-2:]=='id' or k in ('rtt','rtu'):
d[k]=int(v)
for field in ['ats','fols','frs']:
if field in d and isinstance(d[field],list):
d[field] = [int(u) for u in d[field][:5000]]
print json.dumps(d)
def merge_db(*names,**kwargs):
views = [
connect(name).paged_view(
'_all_docs',
include_docs=True,
startkey=kwargs.get('start'),
endkey=kwargs.get('end'))
for name in names
]
    last = None
for row in merge_views(*views):
if row['key']!=last:
del row['doc']['_rev']
print json.dumps(row['doc'])
last = row['key']
def fake_lu_master():
proc = lookup.LookupMaster()
while not proc.halt:
proc.read_scores()
print "scores:%d"%len(proc.scores)
print "halting"
proc.read_scores()
proc.scores.dump(settings.lookup_out)
def fake_lu_slave():
proc = lookup.LookupSlave('y')
Edges.database = CouchDB('http://127.0.0.1:5984/orig_houtx',True)
view = Model.database.paged_view('dev/user_and_tweets',include_docs=True)
for k,g in itertools.groupby(view, lambda r:r['key'][0]):
user_d = g.next()
if user_d['id'][0] != 'U':
print "fail %r"%user_d
continue
user = User(user_d['doc'])
print "scoring %s - %s"%(user._id, user.screen_name)
tweets = [Tweet(r['doc']) for r in g]
if user.local_prob != 1.0:
continue
try:
rels = Edges.get_for_user_id(user._id)
except ResourceNotFound:
print "rels not found"
rels = None
proc.score_new_users(user, rels, tweets)
print "done"
def update_loc():
    # `gisgraphy` is assumed to be a geocoder client built in the interactive
    # session; it is not defined or imported in this module.
    User.database = connect("houtx_user")
    for user in User.get_all():
        old = user.local_prob
        user.local_prob = lookup.guess_location(user,gisgraphy)
        if old != user.local_prob:
            user.save()
def merge_views(*views):
    # This is based on heapq.merge from python 2.6. The big difference is
    # that it orders rows by their 'key' field instead of comparing the rows
    # themselves.
h = []
for itnum, it in enumerate(map(iter, views)):
try:
row = it.next()
h.append([row['key'], itnum, row, it.next])
except StopIteration:
pass
heapq.heapify(h)
while 1:
try:
while 1:
k, itnum, v, next = s = h[0] # raises IndexError when h is empty
yield v
s[2] = next() # raises StopIteration when exhausted
s[0] = s[2]['key']
heapq.heapreplace(h, s) # restore heap condition
except StopIteration:
heapq.heappop(h) # remove empty iterator
except IndexError:
return
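# A minimal sketch of merge_views on plain in-memory rows standing in for the
# CouchDB view iterators it normally consumes; each input must already be
# sorted by 'key', the way _all_docs pages are:
#   a = [{'key': '1', 'doc': {}}, {'key': '3', 'doc': {}}]
#   b = [{'key': '2', 'doc': {}}]
#   [row['key'] for row in merge_views(a, b)]   # -> ['1', '2', '3']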
def force_lookup(to_db="hou",start_id='',end_id=None):
"Lookup users who were not included in the original crawl."
    start = 'U'+start_id
    end = 'U'+end_id if end_id else 'V'
    # user_view is bound to the current (source) database; reassigning
    # Model.database below only changes where the looked-up users get saved.
    user_view = Model.database.paged_view('_all_docs',include_docs=True,startkey=start,endkey=end)
    users = (User(d['doc']) for d in user_view)
    Model.database = connect(to_db)
found_db = connect("houtx")
found_view = found_db.paged_view('_all_docs',startkey=start,endkey=end)
found = set(d['id'] for d in found_view)
scores = Scores()
scores.read(settings.lookup_out)
region = ("Texas","United States")
for user in users:
int_uid = as_int_id(user._id)
if ( user.lookup_done or
user.protected or
int_uid not in scores or
user.local_prob==1 or
(user.local_prob==0 and user.geonames_place.name not in region) or
user._id in found
):
continue
state, rfs, ats = scores.split(int_uid)
        if user.utc_offset == -21600: # -6 hours: US Central Standard Time
if log_score(rfs,ats,.9) < 1: continue
else:
if log_score(rfs,ats) < settings.non_local_cutoff: continue
user_lookup(user)
def _users_from_scores():
scores = Scores()
scores.read(settings.lookup_out)
for uid in scores:
state, rfs, ats = scores.split(uid)
if log_score(rfs,ats)>=11:
yield uid
def _users_from_db():
User.database = connect("houtx_user")
return (int(row['id']) for row in User.database.paged_view("_all_docs"))
def fetch_edges():
Edges.database = connect("houtx_edges")
User.database = connect("away_user")
old_edges = set(int(row['id']) for row in Edges.database.paged_view("_all_docs",endkey="_"))
uids = set(_users_from_scores())-old_edges
settings.pdb()
for g in grouper(100,uids):
for user in twitter.user_lookup(g):
if user is None or user.protected: continue
try:
edges = twitter.get_edges(user._id)
except restkit.errors.Unauthorized:
logging.warn("unauthorized!")
continue
except restkit.errors.ResourceNotFound:
logging.warn("resource not found!?")
continue
edges.save()
user.save()
sleep_if_needed()
def stdin_lookup():
from_db = connect("orig_houtx")
for l in sys.stdin:
user = from_db.get_id(User,l.strip())
user_lookup(user)
def user_lookup(user):
tweets = twitter.save_timeline(user._id,last_tid=settings.min_tweet_id)
if not tweets: return
user.last_tid = tweets[0]._id
user.last_crawl_date = dt.utcnow()
user.next_crawl_date = dt.utcnow()
user.tweets_per_hour = settings.tweets_per_hour
user.lookup_done = True
user.attempt_save()
logging.info("saved %d from %s to %s",len(tweets),tweets[-1]._id,tweets[0]._id)
sleep_if_needed()
def fill_50():
settings.pdb()
Tweet.database = couch('hou_new_tweet')
old_db = couch('houtx_tweet')
res = twitter.TwitterResource()
for line in open('logs/missing_uids'):
uid = line.strip()
view = old_db.paged_view('tweet/uid',key=uid)
last = max(int(row['id']) for row in view)
tweets = res.save_timeline(uid, last_tid=last)
logging.info("saved %d for %s",len(tweets),uid)
sleep_if_needed(res)
def sleep_if_needed(res=twitter):
    # `res` is anything that tracks Twitter rate limits; defaulting to the
    # twitter module (for the bare sleep_if_needed() calls above) assumes it
    # exposes .remaining and .reset_time, as twitter.TwitterResource does.
    logging.info("api calls remaining: %d",res.remaining)
    if res.remaining < 10:
        delta = (res.reset_time-dt.utcnow())
        logging.info("goodnight for %r",delta)
        time.sleep(delta.seconds)