-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintroduction copy.py
287 lines (224 loc) · 9.13 KB
/
introduction copy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
"""
This is code for the introduction chapter. As such, it stands alone
and won't be used anywhere else in the book.
"""
# type: ignore
import tabulate
DEBUG = False
# a bit from https://nyu-cds.github.io/courses/exercises/Basic-python-1/
banded_birds = [['A1', 28], ['A2', 32], ['A3', 1], ['A4', 0],
['A5', 10], ['A6', 22], ['A7', 30], ['A8', 19],
['B1', 145], ['B2', 27], ['B3', 36], ['B4', 25],
['B5', 9], ['B6', 38], ['B7', 21], ['B8', 12],
['C1', 122], ['C2', 87], ['C3', 36], ['C4', 3],
['D1', 0], ['D2', 5], ['D3', 55], ['D4', 62],
['D5', 98], ['D6', 32]]
birds_questions = [
["Q","A"],
["How many sites are there?", None],
["How many birds were counted at the 7th site?", None],
["How many birds were counted at the last site?", None],
["What is the total number of birds counted across all sites?", None],
["What is the average number of birds seen on a site?", None],
["How many birds were counted on sites with codes beginning with C?", None]
]
assert banded_birds[0][1] == 28, "the first item in banded_birds was unexpected"
assert len(birds_questions) == 7, "Wrong number of birds_questions"
users = [
{ "id": 0, "name": "Hero" },
{ "id": 1, "name": "Dunn" },
{ "id": 2, "name": "Sue" },
{ "id": 3, "name": "Chi" },
{ "id": 4, "name": "Thor" },
{ "id": 5, "name": "Clive" },
{ "id": 6, "name": "Hicks" },
{ "id": 7, "name": "Devin" },
{ "id": 8, "name": "Kate" },
{ "id": 9, "name": "Klein" }
]
friendship_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
(4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
# Initialize the dict with an empty list for each user id:
friendships = {user["id"]: [] for user in users}
# And loop over the friendship pairs to populate it:
for i, j in friendship_pairs:
friendships[i].append(j) # Add j as a friend of user i
friendships[j].append(i) # Add i as a friend of user j
def number_of_friends(user):
"""How many friends does _user_ have?"""
user_id = user["id"]
friend_ids = friendships[user_id]
return len(friend_ids)
total_connections = sum(number_of_friends(user)
for user in users) # 24
assert total_connections == 24
num_users = len(users) # length of the users list
avg_connections = total_connections / num_users # 24 / 10 == 2.4
assert num_users == 10
assert avg_connections == 2.4
# Create a list (user_id, number_of_friends).
num_friends_by_id = [(user["id"], number_of_friends(user))
for user in users]
num_friends_by_id.sort( # Sort the list
key=lambda id_and_friends: id_and_friends[1], # by num_friends
reverse=True) # largest to smallest
# Each pair is (user_id, num_friends):
# [(1, 3), (2, 3), (3, 3), (5, 3), (8, 3),
# (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]
assert num_friends_by_id[0][1] == 3 # several people have 3 friends
assert num_friends_by_id[-1] == (9, 1) # user 9 has only 1 friend
def foaf_ids_bad(user):
"""foaf is short for "friend of a friend" """
return [foaf_id
for friend_id in friendships[user["id"]]
for foaf_id in friendships[friend_id]]
[0, 2, 3, 0, 1, 3]
assert foaf_ids_bad(users[0]) == [0, 2, 3, 0, 1, 3]
if DEBUG:
print(friendships[0]) # [1, 2]
print(friendships[1]) # [0, 2, 3]
print(friendships[2]) # [0, 1, 3]
assert friendships[0] == [1, 2]
assert friendships[1] == [0, 2, 3]
assert friendships[2] == [0, 1, 3]
from collections import Counter # not loaded by default
def friends_of_friends(user):
user_id = user["id"]
return Counter(
foaf_id
for friend_id in friendships[user_id] # For each of my friends,
for foaf_id in friendships[friend_id] # find their friends
if foaf_id != user_id # who aren't me
and foaf_id not in friendships[user_id] # and aren't my friends.
)
if DEBUG:
print(friends_of_friends(users[3])) # Counter({0: 2, 5: 1})
assert friends_of_friends(users[3]) == Counter({0: 2, 5: 1})
interests = [
(0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
(0, "Spark"), (0, "Storm"), (0, "Cassandra"),
(1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
(1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
(2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
(3, "statistics"), (3, "regression"), (3, "probability"),
(4, "machine learning"), (4, "regression"), (4, "decision trees"),
(4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
(5, "Haskell"), (5, "programming languages"), (6, "statistics"),
(6, "probability"), (6, "mathematics"), (6, "theory"),
(7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
(7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
(8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
(9, "Java"), (9, "MapReduce"), (9, "Big Data")
]
def data_scientists_who_like(target_interest):
"""Find the ids of all users who like the target interest."""
return [user_id
for user_id, user_interest in interests
if user_interest == target_interest]
from collections import defaultdict
# Keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = defaultdict(list)
for user_id, interest in interests:
user_ids_by_interest[interest].append(user_id)
# Keys are user_ids, values are lists of interests for that user_id.
interests_by_user_id = defaultdict(list)
for user_id, interest in interests:
interests_by_user_id[user_id].append(interest)
def most_common_interests_with(user):
return Counter(
interested_user_id
for interest in interests_by_user_id[user["id"]]
for interested_user_id in user_ids_by_interest[interest]
if interested_user_id != user["id"]
)
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
(48000, 0.7), (76000, 6),
(69000, 6.5), (76000, 7.5),
(60000, 2.5), (83000, 10),
(48000, 1.9), (63000, 4.2)]
# Keys are years, values are lists of the salaries for each tenure.
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
salary_by_tenure[tenure].append(salary)
# Keys are years, each value is average salary for that tenure.
average_salary_by_tenure = {
tenure: sum(salaries) / len(salaries)
for tenure, salaries in salary_by_tenure.items()
}
assert average_salary_by_tenure == {
0.7: 48000.0,
1.9: 48000.0,
2.5: 60000.0,
4.2: 63000.0,
6: 76000.0,
6.5: 69000.0,
7.5: 76000.0,
8.1: 88000.0,
8.7: 83000.0,
10: 83000.0
}
{0.7: 48000.0,
1.9: 48000.0,
2.5: 60000.0,
4.2: 63000.0,
6: 76000.0,
6.5: 69000.0,
7.5: 76000.0,
8.1: 88000.0,
8.7: 83000.0,
10: 83000.0}
def tenure_bucket(tenure):
if tenure < 2:
return "less than two"
elif tenure < 5:
return "between two and five"
else:
return "more than five"
# Keys are tenure buckets, values are lists of salaries for that bucket.
salary_by_tenure_bucket = defaultdict(list)
for salary, tenure in salaries_and_tenures:
bucket = tenure_bucket(tenure)
salary_by_tenure_bucket[bucket].append(salary)
# Keys are tenure buckets, values are average salary for that bucket
average_salary_by_bucket = {
tenure_bucket: sum(salaries) / len(salaries)
for tenure_bucket, salaries in salary_by_tenure_bucket.items()
}
{'between two and five': 61500.0,
'less than two': 48000.0,
'more than five': 79166.66666666667}
assert average_salary_by_bucket == {
'between two and five': 61500.0,
'less than two': 48000.0,
'more than five': 79166.66666666667
}
def predict_paid_or_unpaid(years_experience):
if years_experience < 3.0:
return "paid"
elif years_experience < 8.5:
return "unpaid"
else:
return "paid"
interests = [
(0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
(0, "Spark"), (0, "Storm"), (0, "Cassandra"),
(1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
(1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
(2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
(3, "statistics"), (3, "regression"), (3, "probability"),
(4, "machine learning"), (4, "regression"), (4, "decision trees"),
(4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
(5, "Haskell"), (5, "programming languages"), (6, "statistics"),
(6, "probability"), (6, "mathematics"), (6, "theory"),
(7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
(7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
(8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
(9, "Java"), (9, "MapReduce"), (9, "Big Data")
]
words_and_counts = Counter(word
for user, interest in interests
for word in interest.lower().split())
if DEBUG:
for word, count in words_and_counts.most_common():
if count > 1:
print(word, count)