-
Notifications
You must be signed in to change notification settings - Fork 21
/
scrape.py
82 lines (71 loc) · 2.86 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Scrape images linked to from specified subreddits."""
try:
    import praw
except ImportError:
    # Point the user at the dependency before propagating the error.
    print("Unable to find praw. see https://github.com/praw-dev/praw")
    # Bare `raise` re-raises the ImportError in flight; the original
    # `raise()` instead raised a TypeError (raising an empty tuple),
    # masking the real problem.
    raise
from time import sleep
from urllib import urlopen
import os
import sys
import datetime
import settings as settings_
import string
_REDDIT_API_SLEEP_TIME = 2.50
_VALID_CHARS = frozenset(''.join(("-_.() ", string.ascii_letters, string.digits)))
def sanitize(s, default_name="image"):
sanitized = ''.join(c for c in s if c in _VALID_CHARS)
return sanitized if sanitized else default_name # Use default if string is empty.
def unique(filename):
    """Return a guaranteed-unique version of a given filename.

    If *filename* does not already exist it is returned unchanged.
    Otherwise a counter is inserted before the extension
    (``foo.jpg`` -> ``foo.0.jpg``, ``foo.1.jpg``, ...) until an unused
    name is found.  Extension-less names get the counter appended
    (``foo`` -> ``foo.0``).
    """
    if not os.path.exists(filename):
        return filename
    # Split on the LAST dot only; rpartition leaves base empty when there
    # is no dot at all.
    base, dot, extension = filename.rpartition('.')
    num = 0
    while True:
        # Build candidates by plain concatenation: %-formatting would
        # blow up on filenames containing a literal '%' (possible, since
        # the extension comes straight from the submission URL).
        if dot:
            candidate = base + '.' + str(num) + '.' + extension
        else:
            candidate = filename + '.' + str(num)
        if not os.path.exists(candidate):
            return candidate
        num += 1
def download_and_save(url, filename):
    """Save the data at a given URL to a given local filename.

    The output file is opened in binary mode: images are binary data,
    and text mode would mangle newline bytes on Windows (and reject
    bytes entirely on Python 3).
    """
    data = urlopen(url).read()
    with open(filename, mode='wb') as output:
        output.write(data)
def fetch_image(submission, directory):
    """Download a submission's linked image into *directory*.

    The local filename is the sanitized submission title plus the URL's
    extension, made unique so nothing is overwritten.
    """
    url = submission.url
    # Extension is everything after the last dot, with any query string
    # or fragment stripped (e.g. "foo.jpg?w=640" -> "jpg").
    extension = url.split('.')[-1].split('?')[0].split('#')[0]
    title = sanitize(submission.title)  # Remove illegal filename characters.
    if title.endswith('.'):
        title = title[:-1]  # Avoid "foo..jpg".
    local_filename = unique(os.path.join(directory, '%s.%s' % (title, extension)))
    download_and_save(url, local_filename)
def scrape(settings, include_sub=None, include_dir=None):
    """Generator that downloads top daily images for configured subreddits.

    For each enabled subreddit it first yields the target directory name,
    then (after downloading) the number of files fetched.  *include_sub*
    and *include_dir* optionally restrict the run to the named subreddits
    or grouping directories; when None, no filter is applied.
    """
    r = praw.Reddit(user_agent=settings.user_agent)
    for grouping in settings.groupings:
        # Skip disabled groupings and ones excluded by include_dir.
        if ((include_dir is not None and grouping.name not in include_dir) or
            not grouping.enabled):
            continue
        for subreddit in grouping.subreddits:
            if ((include_sub is not None and subreddit.name not in include_sub) or
                not subreddit.enabled):
                continue
            dirname = grouping.dirname_for(subreddit)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            yield dirname
            # Lowercase the wanted extensions once, not per submission;
            # endswith accepts a tuple of suffixes.
            extensions = tuple(ext.lower() for ext in set(subreddit.file_types))
            limit = subreddit.num_files
            submissions = r.get_subreddit(subreddit.name).get_top_from_day(limit=limit)
            count = 0
            for sub in submissions:
                # Only fetch links whose URL ends in a wanted file type.
                if sub.url.lower().endswith(extensions):
                    fetch_image(sub, dirname)
                    count += 1
            yield count
            sleep(_REDDIT_API_SLEEP_TIME)  # Avoid offending the Reddit API Gods!
if __name__ == '__main__':
    # Drain the scrape generator for its side effects (downloads);
    # the yielded progress values are not needed here.
    for _ in scrape(settings_.Settings()):
        pass