-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_image_refs.py
executable file
·131 lines (101 loc) · 3.58 KB
/
check_image_refs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
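#
# Python script that cross-checks the image references found in the html
# pages of the current directory against the image files found on disk, and
# reports images that are never referenced and references that have no file.
#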
from html.parser import HTMLParser
from os import listdir, walk
from os.path import isfile, join
# Image references collected from the html pages. This is a dict, not a list:
# each key is a reference string (a src/href value that looks like an image)
# and its value is the name of the html file the reference was found in.
list_of_img_refs = {}
def isImage(imgref):
    """Return True when *imgref* names an image, False otherwise.

    "Image" here means the string ends with one of jpg, JPG, gif or png.
    The comparison is case-sensitive apart from the explicit JPG variant,
    and no leading dot is required.
    """
    image_endings = ("JPG", "jpg", "gif", "png")
    # str.endswith accepts a tuple and is True if any one suffix matches.
    return imgref.endswith(image_endings)
def get_img_ref_from_attrs(attrs, filename):
    """Record every image reference found in the attributes of one html tag.

    The attributes of an html tag come in a list of (name, value) pairs.
    We want either the src or the href. For example, a tag may be:
        <img src="advertising/elsiecopies1r.jpg" alt="elsie copies" width="200">
    attrs is now:
        [('src', 'advertising/elsiecopies1r.jpg'),
         ('alt', 'elsie copies'),
         ('width', '200')]
    And we want the src one.

    Each src/href value that isImage() accepts is stored in the module-level
    dict list_of_img_refs, keyed by the reference and mapped to *filename*.
    If the same reference is seen again, the most recent filename wins.

    Parameters:
        attrs: list of (name, value) pairs as passed to
            HTMLParser.handle_starttag.
        filename: name of the html file the tag was read from.
    """
    for name, value in attrs:
        if name not in ('src', 'href'):
            continue
        # A valueless attribute such as <a href> is reported with the value
        # None; skip it instead of crashing inside isImage().
        if value is None:
            continue
        if isImage(value):
            list_of_img_refs[value] = filename
class CowHTMLParser(HTMLParser):
    """HTML parser that collects image references from one html file.

    Every start tag whose name begins with "img" or "a" has its attributes
    handed to get_img_ref_from_attrs(), together with the name of the file
    this parser was created for. Because the match is a prefix test, tags
    such as "area" are inspected as well as "a".
    """

    def __init__(self, filename):
        """Create a parser for the html file called *filename*."""
        super().__init__()
        self.filename = filename

    def handle_starttag(self, tag, attrs):
        """Pass the attributes of img/a start tags on for inspection."""
        if tag.startswith(("img", "a")):
            # Use the filename this parser was built with rather than
            # whatever global happens to be called "filename".
            get_img_ref_from_attrs(attrs, self.filename)

    def handle_endtag(self, tag):
        """End tags carry no image references; ignore them."""
        pass

    def handle_data(self, data):
        """Text content carries no image references; ignore it."""
        pass

    def error(self, message):
        """Ignore parser errors so that parsing continues."""
        pass
def get_files_from_dir(dirname, type=None):
    """List the regular files directly inside *dirname*.

    Sub-directories are skipped and not descended into. When *type* is
    given, only names ending with it are kept. The names are returned
    without the directory part, in the order listdir() produced them.
    """
    matches = []
    for entry in listdir(dirname):
        if not isfile(join(dirname, entry)):
            continue
        if type is not None and not entry.endswith(type):
            continue
        matches.append(entry)
    return matches
# Get all the html files in the current directory (sub-directories are not
# searched for html).
htmlfiles = get_files_from_dir(".", "html")

# Go through all the html files. Parsing a file adds every image reference
# it contains to list_of_img_refs.
for filename in htmlfiles:
    parser = CowHTMLParser(filename)
    # "with" closes the file even if parsing raises. errors="replace" keeps
    # one undecodable byte in a page from aborting the whole check.
    with open(filename, "r", errors="replace") as file:
        for line in file:
            parser.feed(line)
    # Flush anything the parser is still buffering at end of file.
    parser.close()

# The distinct image references found in the html.
refSet = set(list_of_img_refs.keys())

# Go through all the images in all the directories.
images = []
for root, dirs, files in walk("."):
    for name in files:
        # walk(".") yields paths that start with "./"; drop those two
        # characters so the path has the same form as a reference in the html.
        fullname = join(root, name)[2:]
        if isImage(fullname):
            images.append(fullname)
imageSet = set(images)

# Compare them.
imgSetCopy = imageSet.difference(refSet)
print(" Images in the image directory that are NOT in the html: " + str(len(imgSetCopy)))
print(str(imgSetCopy))
print()

refSetAsList = sorted(refSet.difference(imageSet))
print(" Images in the html that are NOT in the image directory: " + str(len(refSetAsList)))
print(str(refSetAsList))
# For each missing image, also say which html file referenced it.
for img in refSetAsList:
    print(f"\t{img:20} from {list_of_img_refs[img]}")
print()