-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
141 lines (113 loc) · 4.69 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
__author__ = 'Danijel'
import requests
import urllib.request
from bs4 import BeautifulSoup
# ------ Functions ------
#depending on development method of certain site, pictures can be stored in multiple ways
'''
This method connects image url with site url in order to get real image link
example:
for website -> http://www.awwwards.com/awards-of-the-month/
retrieved image url is -> /bundles/tvawwwards/images/pages/img-lovedays.png
as a result you get -> http://www.awwwards.com/awards-of-the-month//bundles/tvawwwards/images/pages/img-lovedays.png
'''
def getAllImageLinksMethod1(url):
print('\n****** Trying to download image via method 1 ******')
counter = 0
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.find_all('img'):
imagelink = url + link.get('src')
try:
downloadWebImage(imagelink)
print('Picture: ', imagelink, ' is successfully downloaded :)')
counter +=1
except:
print('Error downloading', imagelink.split("/")[-1], '-.-')
print('\nNumber of downloaded images: ', counter)
'''
Some pages already contain direct source to image
for website -> http://www.awwwards.com/awards-of-the-month/
retrieved image url is -> /bundles/tvawwwards/images/pages/img-lovedays.png
as a result you get -> /bundles/tvawwwards/images/pages/img-lovedays.png
This doesn't work of course, but in some cases result link is a real link
'''
def getAllImageLinksMethod2(url):
print('\n****** Trying to download image via method 2 ******')
counter = 0
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.find_all('img'):
imagelink = link.get('src')
try:
downloadWebImage(imagelink)
print('Picture: ', imagelink, ' is successfully downloaded :)')
counter +=1
except:
print('Error downloading', imagelink.split("/")[-1], '-.-')
print('\nNumber of downloaded images: ', counter)
'''
This method is for sites that store all images in subdirectories
for website -> http://www.awwwards.com/awards-of-the-month/
retrieved image url is -> /bundles/tvawwwards/images/pages/img-lovedays.png
we remove the part after .com
result url is -> http://www.awwwards.com/bundles/tvawwwards/images/pages/img-lovedays.png
'''
def getAllImageLinksMethod3(url):
print('\n****** Trying to download image via method 3 ******')
counter = 0
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.find_all('img'):
imagelink = link.get('src')
website = str(url)
realImageLink = 'http://' + website.split("/")[2] + imagelink
try:
downloadWebImage(realImageLink)
print('Picture: ', realImageLink, ' is successfully downloaded :)')
counter +=1
except:
print('Error downloading', realImageLink.split("/")[-1], '-.-')
print('\nNumber of downloaded images: ', counter)
'''
This method if for rare cases like polleosport.hr
Their website doesn't use www prefiks, so that part is removed and retrieved link looks like http://[pageURL]/+ link
'''
def getAllImageLinksMethod4(url):
print('\n****** Trying to download image via method 4 ******')
counter = 0
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.find_all('img'):
imagelink = link.get('src')
website = str(url)
realImageLink = ""
try:
realImageLink = 'http://' + website.split("www.")[1] + '/' + imagelink
except:
print("Error retrieving image link")
try:
downloadWebImage(realImageLink)
print('Picture: ', realImageLink, ' is successfully downloaded :)')
counter +=1
except:
print('Error downloading', realImageLink.split("/")[-1], '-.-')
print('\nNumber of downloaded images:: ', counter)
def downloadWebImage(url):
filename = url.split("/")[-1]
urllib.request.urlretrieve(url,filename)
# ------ End function ------
# ------ Main ------
print('This is a really simple image downloading program')
print('Enter website URL in the following format:')
print('http://www.pageURL.com|hr|org etc. \n')
print('or try example: https://stupid-studio.com \nor \nhttp://www.hyperquake.com/work/old-spice-gallery')
linkWebStraniceInput = str(input('Enter website url: '))
getAllImageLinksMethod1(linkWebStraniceInput)
getAllImageLinksMethod2(linkWebStraniceInput)
getAllImageLinksMethod3(linkWebStraniceInput)
getAllImageLinksMethod4(linkWebStraniceInput)