mirror-mirror.py
#!/usr/bin/env python3
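"""Collect direct, full-resolution image links from a user's public profile.

Supported services: instagram, twitter, vsco, tinder, okcupid.

Usage: mirror-mirror.py <service> <username...>
"""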
import sys

import requests

def scrape_instagram(username):
    #start a new web-browsing session
    s = requests.Session()
    #make containers for all the links to be collected
    messy_links, clean_links = [], []
    #download the user's profile page
    profile_page = s.get('http://instagram.com/'+username).text
    #check if account exists
    if '<body class=" p-error dialog-404">' not in profile_page:
        #if the user's account is private, only the profile page itself is available
        if '"is_private": true' in profile_page:
            user_links_page = profile_page.split('"')
        #otherwise, get all the other images
        else:
            profile_page = profile_page.split('"owner": {"id": "')
            #get the user's unique user id from the profile page
            unique_user_id = profile_page[1][:profile_page[1].index('"')]
            #get the dynamically created javascript file's temporary unique pathway
            unique_commons_js = profile_page[-1].split('en_US_Commons.js/')[1]
            unique_commons_js = unique_commons_js[:unique_commons_js.index('.js')]
            #download the dynamically generated javascript file to get the unique query id
            javascript_page = s.get('https://www.instagram.com/static/bundles/en_US_Commons.js/'+unique_commons_js+'.js')
            javascript_page = javascript_page.text.split('return e.profilePosts.byUserId.get(t).pagination},queryId:"')[1]
            #get the unique query id from the javascript file
            unique_query_id = javascript_page[:javascript_page.index('"')]
            #using the query and user ids, download the file containing the links to all pictures ever posted
            user_links_page = s.get('https://www.instagram.com/graphql/query/?query_id='+unique_query_id+'&id='+unique_user_id+'&first=1000000')
            user_links_page = user_links_page.text.split('"')
        #collect the url of every possible jpg picture
        for link in user_links_page:
            if '.jpg' in link:
                messy_links.append(link)
        #rebuild each link without the size segments to get the uncompressed image
        for link in messy_links:
            segmented_link = link.split('/')
            #the post id is everything after the last slash
            unique_post_id = link.rsplit('/', 1)[-1]
            clean_link = 'https://'+segmented_link[2]+'/'+segmented_link[3]+'/'+unique_post_id
            if clean_link not in clean_links:
                clean_links.append(clean_link)
    #terminate the browsing session
    s.close()
    #return all the uncompressed image links
    return clean_links

def scrape_twitter(username):
    #start a new web-browsing session
    s = requests.Session()
    #make containers for all the links to be collected
    messy_links, clean_links = [], []
    #using the user's username, download the file containing the links to the last 3200 pictures ever posted
    user_links_page = s.get('https://twitter.com/i/profiles/show/'+username+'/media_timeline.json?count=3200')
    user_links_page = user_links_page.text.split('"')
    #collect the url of every possible jpg picture
    for link in user_links_page:
        #strip the json escape backslashes
        link = link.replace('\\', '')
        if link.endswith('.jpg'):
            if link.startswith('https://pbs.twimg.com/media/') or \
               link.startswith('https://pbs.twimg.com/tweet_video_thumb/') or \
               link.startswith('https://pbs.twimg.com/ext_tw_video_thumb/'):
                #the :orig suffix requests the uncompressed image
                messy_links.append(link + ':orig')
    #make sure there are no duplicate links
    for link in messy_links:
        if link not in clean_links:
            clean_links.append(link)
    #check if the account has any pictures associated with it
    if len(clean_links) > 0:
        #the profile picture is compressed regardless, so remove it
        if 'profile_images' in clean_links[0]:
            clean_links.pop(0)
    #terminate the browsing session
    s.close()
    #return all the uncompressed image links
    return clean_links

def scrape_vsco(username):
    #start a new web-browsing session
    s = requests.Session()
    #make containers for all the links to be collected
    messy_links, clean_links = [], []
    #download the user's profile page
    profile_page = s.get('http://vsco.co/'+username)
    #check if account exists
    if '<p class="page-error-heading mt40">This page does not exist</p>' not in profile_page.text:
        #get the unique session id from the site's cookies
        unique_session_id = str(profile_page.cookies).split('vs=')[1]
        unique_session_id = unique_session_id[:unique_session_id.index(' ')]
        #convert the profile page to a string
        profile_page = profile_page.text
        #get the user's unique user id from the profile page
        unique_user_id = profile_page.split('"id":')[1]
        unique_user_id = unique_user_id[:unique_user_id.index(',')]
        #find the user's profile picture link
        profile_picture_link = profile_page.split('responsive_url":"')[1]
        profile_picture_link = profile_picture_link[:profile_picture_link.index('"')]
        #add the profile picture link to the list
        messy_links.append('http://'+profile_picture_link)
        #using the session and user ids, download the file containing the links to all pictures ever posted
        user_links_page = s.get('http://vsco.co/ajxp/'+unique_session_id+'/2.0/medias?site_id='+unique_user_id+'&page=1&size=10000').text.split('"')
        #collect the url of every possible jpg picture
        for link in user_links_page:
            if 'im.vsco.co' in link and '.jpg' in link:
                messy_links.append('http://'+link)
        #strip the json escape backslashes to clean up the links
        for link in messy_links:
            clean_links.append(link.replace('\\', ''))
    #terminate the browsing session
    s.close()
    #return all the image links
    return clean_links

def scrape_tinder(username):
    #start a new web-browsing session
    s = requests.Session()
    #make containers for all the links to be collected
    messy_links, clean_links = [], []
    #download the user's profile page
    profile_page = s.get('https://www.gotinder.com/@'+username)
    #check if account exists
    if "<h1 id='title'>Looking for Someone?</h1>" not in profile_page.text:
        #convert the profile page to a list
        profile_page = profile_page.text.split('"')
        #collect the url of every possible jpg picture
        for link in profile_page:
            if '.jpg' in link:
                messy_links.append(link)
        #make sure there are no duplicate links
        for link in messy_links:
            if link not in clean_links:
                clean_links.append(link)
        #if the account has pictures, remove the second link, which is not user related
        if len(clean_links) > 1:
            clean_links.pop(1)
    #terminate the browsing session
    s.close()
    #return all the image links
    return clean_links

def scrape_okcupid(username):
    #start a new web-browsing session
    s = requests.Session()
    #make containers for all the links to be collected
    messy_links, clean_links = [], []
    #download the user's profile page
    profile_page = s.get('https://www.okcupid.com/profile/'+username).text
    #check if account exists
    if '<title>Account not found | OkCupid</title>' not in profile_page:
        #convert the profile page to a list
        profile_page = profile_page.split('"')
        #collect the url of every possible jpeg picture
        for link in profile_page:
            if '.jpeg' in link and '/images/' in link:
                messy_links.append(link)
        #check if the account has any pictures associated with it
        if len(messy_links) > 0:
            #choose the first cdn server available to prevent duplicate images
            chosen_cdn_server = messy_links[0][:messy_links[0].index('/images')]
            #rebuild each link without the size suffix to get the uncompressed image
            for link in messy_links:
                #the file name is everything after the last slash
                clean_link = chosen_cdn_server+'/images/'+link.rsplit('/', 1)[-1]
                #drop everything after the last dot and restore the extension
                clean_link = clean_link.rsplit('.', 1)[0]+'.jpeg'
                if clean_link not in clean_links:
                    clean_links.append(clean_link)
    #terminate the browsing session
    s.close()
    #return all the uncompressed image links
    return clean_links

if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.stderr.write('usage: mirror-mirror.py <service> <username...>\n')
        sys.exit(1)
    #pick the scraper that matches the requested service
    service = sys.argv[1].lower()
    if service == 'instagram':
        scrape = scrape_instagram
    elif service == 'twitter':
        scrape = scrape_twitter
    elif service == 'vsco':
        scrape = scrape_vsco
    elif service == 'tinder':
        scrape = scrape_tinder
    elif service == 'okcupid':
        scrape = scrape_okcupid
    else:
        sys.stderr.write('unknown service: %s\n' % service)
        sys.exit(1)
    #print every collected link, one per line
    for username in sys.argv[2:]:
        for link in scrape(username):
            print(link)
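
#example usage (the username here is hypothetical):
#   ./mirror-mirror.py twitter jack
#the script only prints links, one per line; one possible way to fetch the
#actual files is to pipe the output into a downloader such as curl:
#   ./mirror-mirror.py twitter jack | xargs -n 1 curl -O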