-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwordoff.py
74 lines (55 loc) · 2.32 KB
/
wordoff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Remove cruft that has been added through pasting from Word
# into a WYSIWYG editor.
import re
import unicodedata
# Opening (or self-closing) tag that carries attributes.
#   group 1: tag name (letters plus at most one trailing digit, e.g. h1)
#   group 2: the attribute text
#   group 3: '/' when the tag is self-closing, otherwise ''
# NOTE(review): group 2 is `[^/>]*`, so a tag whose attribute text contains
# a '/' anywhere other than directly before the closing '>' (for example a
# URL in href/src) is not matched at all -- confirm whether that is intended.
match_tags_with_attributes = re.compile(r'<([a-zA-Z]+[0-9]?) ([^/>]*)(/?)>')
# Opening and closing span tags, including any attributes.
match_spans = re.compile(r'</?span[^>]*>')
# Opening and closing div tags, including any attributes.
match_divs = re.compile(r'</?div[^>]*>')
# An attribute-less opening tag followed, across optional whitespace only,
# by its own closing tag. Tag names here are letters only (no digits).
match_empty_elements = re.compile(r'<([a-zA-Z]+)>\s*</\1>')
# Three or more newlines, each optionally followed by whitespace.
match_multiple_linebreaks = re.compile(r'(\n\s*){3,}')
# Paired element in the w: or o: namespace, together with its content.
#   group 1: the namespaced tag name, reused to find the closing tag
#   group 2: optional attribute text
# The content is matched lazily (`.*?`) so each element ends at its own first
# closing tag; a greedy match would swallow everything up to the LAST closing
# tag of the same name, deleting ordinary content that sits between two
# separate elements. DOTALL lets the content span line breaks.
match_word_namespaced_elements = re.compile(r'<([wo]:[a-zA-Z]+)( [^>]+)?>.*?</\1>', re.DOTALL)
# Self-closing element in the w: or o: namespace, with optional attributes.
match_empty_word_namespaced_elements = re.compile(r'<([wo]:[a-zA-Z]+)( [^>]+)?/>')
# A style attribute (double- or single-quoted) plus surrounding whitespace.
# Each quoting form is matched on its own so that a value containing the
# other kind of quote, e.g. style='font-family:"Arial"', is consumed whole
# instead of being cut at the first inner quote. `*` (not `+`) also lets an
# empty style="" match.
match_style_attributes = re.compile(r'\s*style\s*=\s*(?:"[^"]*"|\'[^\']*\')\s*')
def ignore_some_tags(matchobj):
    """Build the replacement text for one tag that carries attributes.

    Intended as the ``repl`` callback for ``match_tags_with_attributes.sub``.

    ``matchobj`` must expose group 0 (the whole tag), group 1 (the tag name)
    and group 3 (``'/'`` for a self-closing tag, otherwise empty).

    Returns the tag with only its style attribute removed for ``a`` and
    ``img`` tags, and the bare tag (no attributes) for every other tag.
    """
    tag_name = matchobj.group(1)

    # Only strip style attributes for <a>s or <img>s. The name is compared
    # case-insensitively so that <A ...> and <IMG ...> keep their href/src
    # instead of losing every attribute.
    if tag_name.lower() in ('a', 'img'):
        return match_style_attributes.sub(' ', matchobj.group(0))

    # Strip all attributes for everything else, preserving the trailing
    # slash of self-closing tags.
    if matchobj.group(3):
        return '<%s />' % tag_name
    return '<%s>' % tag_name
def stripAttributes(str):
    """Rewrite every attribute-bearing tag in ``str`` via ``ignore_some_tags``.

    Tags are located with ``match_tags_with_attributes``; all other text is
    returned unchanged.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so
    # that existing keyword callers continue to work.
    rewritten = match_tags_with_attributes.sub(ignore_some_tags, str)
    return rewritten
def stripSpans(str):
    """Return ``str`` with everything matched by ``match_spans`` deleted.

    Only the span tags themselves are removed; the text between them stays.
    """
    without_spans = match_spans.sub('', str)
    return without_spans
def stripDivs(str):
    """Return ``str`` with everything matched by ``match_divs`` deleted.

    Only the div tags themselves are removed; the text between them stays.
    """
    without_divs = match_divs.sub('', str)
    return without_divs
def reduceLineBreaks(str):
    """Collapse each run matched by ``match_multiple_linebreaks`` to two newlines."""
    collapsed = match_multiple_linebreaks.sub('\n\n', str)
    return collapsed
def stripEmptyElements(str):
    """Delete elements that contain nothing, or only whitespace.

    This is a single pass with ``match_empty_elements``: an element that only
    becomes empty because its child was removed in this pass is left in place.
    """
    pruned = match_empty_elements.sub('', str)
    return pruned
def strip_word_namespaced_elements(str):
    """Delete w:/o: namespaced elements from ``str``.

    Paired elements (matched by ``match_word_namespaced_elements``) are
    removed first, then self-closing ones (matched by
    ``match_empty_word_namespaced_elements``).
    """
    without_paired = match_word_namespaced_elements.sub('', str)
    without_self_closing = match_empty_word_namespaced_elements.sub('', without_paired)
    return without_self_closing
def xenophobia(str):
    """Fold ``str`` down to plain ASCII.

    The text is NFKD-normalised, which splits accented characters into a
    base character plus combining marks, and every non-ASCII code point is
    then dropped.

    ``str`` may be UTF-8 encoded bytes or already-decoded text. The result is
    the same kind of string that was passed in: bytes in, bytes out; text in,
    text out.

    Raises UnicodeDecodeError if a bytes argument is not valid UTF-8.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so
    # that existing keyword callers continue to work.
    if isinstance(str, bytes):
        decoded = str.decode('utf-8')
        return unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore')

    # Already text: calling .decode() on it would fail, so normalise it
    # directly and hand text back.
    ascii_bytes = unicodedata.normalize('NFKD', str).encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
def superClean(str):
    """Run the whole clean-up pipeline over ``str`` and return the result.

    Steps, in order: strip attributes, drop span tags, drop div tags, remove
    Word-namespaced elements, remove empty elements, collapse line breaks.
    """
    cleaned = stripAttributes(str)
    cleaned = stripSpans(cleaned)
    cleaned = stripDivs(cleaned)
    cleaned = strip_word_namespaced_elements(cleaned)
    # The xenophobia() ASCII-folding step was commented out of this pipeline
    # and stays disabled here.

    # Removing an empty element can leave its parent empty in turn, so keep
    # pruning until a pass changes nothing rather than stopping after a fixed
    # number of passes. Every pass that changes the text shortens it, so the
    # loop always terminates.
    while True:
        pruned = stripEmptyElements(cleaned)
        if pruned == cleaned:
            break
        cleaned = pruned

    return reduceLineBreaks(cleaned)