-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollect-from-snapzilla.sh
53 lines (41 loc) · 1.67 KB
/
collect-from-snapzilla.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/bash
#
# Download the images shown on the Snapzilla search-result pages for a fixed
# list of keywords into ./downloads.
#
# Usage: ./collect-from-snapzilla.sh
#
# The downloads directory is emptied at the start of every run.

# -e: stop at the first failing command, -u: treat unset variables as errors,
# -x: echo each command before it runs.
set -eux

# Directory (relative to the current working directory) that receives the files.
readonly DOWNLOAD_DIR="downloads"

# Number of result pages requested for each keyword.
readonly LAST_PAGE=32

# Keywords to search for. The script issues one search per keyword because,
# per the original author, the site's search has no "OR" support.
declare -a keywords=(
  "guitar" "piano" "keyboard" "turntable" "violin" "cello" "saxophone"
  "trumpet" "bass" "horn" "drum" "tuba" "harp"
)

# wget options shared by every request. Kept in an array so each option stays
# a single word; the accept patterns in particular must reach wget unexpanded
# by the shell.
declare -a wget_opts=(
  --adjust-extension
  --random-wait
  --limit-rate=100k
  --span-hosts
  --convert-links
  --backup-converted
  --no-directories
  --timestamping
  --page-requisites
  --directory-prefix="${DOWNLOAD_DIR}"
  --execute robots=off
  --accept='*jpg*,*png*'
  # other unused arguments
  # --recursive
  # --level 1
  # --domains en.wikipedia.org
)

# Make the directory if it is not there, then remove results of earlier runs.
# ${DOWNLOAD_DIR:?} aborts instead of expanding to "/*" should the name ever
# be empty.
mkdir -p -- "${DOWNLOAD_DIR}"
rm -f -- "${DOWNLOAD_DIR:?}"/*

for keyword in "${keywords[@]}"; do
  for ((page = 1; page <= LAST_PAGE; page++)); do
    # Build the search URL for this keyword and result page.
    url="http://www.sluniverse.com/snapzilla/Home/Search?term=${keyword}&page=${page}#pictures"

    # Fetch the page and its image requisites. The URL is quoted so the shell
    # does not treat the '?' as a glob character. Because of `set -e`, a
    # non-zero exit status from wget ends the whole run.
    wget "${wget_opts[@]}" "${url}"
  done
done

# Unused alternative search URL (Trade Me), kept for reference; it expects a
# single $SEARCH_STRING instead of the keyword list used above:
#SEARCH_STRING="robot"
# URL='http://www.trademe.co.nz/Browse/SearchResults.aspx?&cid=0&searchType=&searchString='$SEARCH_STRING'&x=0&y=0&type=Search&sort_order=&redirectFromAll=False&rptpath=all&page='$PAGE'&user_region=100&user_district=0&generalSearch_keypresses=8&generalSearch_suggested=0&generalSearch_suggestedCategory='