-
Notifications
You must be signed in to change notification settings - Fork 183
/
extract-metadata.py
executable file
·108 lines (90 loc) · 3.24 KB
/
extract-metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python3
import os
import os.path
import re
import sys
import xml
import xml.sax
def extract_md(filename):
    """Return the front-matter metadata of a markdown draft as a dict.

    The first line of the file selects the header format: ``---`` delimits a
    YAML header (Kramdown) and ``%%%`` delimits a TOML header (Mmark).  The
    header runs up to the next line equal to that same delimiter, or to the
    end of the file if the delimiter never appears again.

    Extraction is deliberately best-effort.  Any failure -- unreadable file,
    unexpected first line, missing parser library, malformed header, or a
    header that does not parse to a mapping -- yields an empty dict rather
    than an exception, so callers can always treat the result as a mapping.

    Args:
        filename: Path of the markdown file to read.

    Returns:
        The parsed header as a dict, or ``{}`` if nothing could be extracted.
    """
    try:
        # Read as UTF-8 instead of the locale's default encoding so the
        # result does not vary between environments.
        with open(filename, "r", encoding="utf-8") as fh:
            section_header = fh.readline().strip()
            if section_header not in ("%%%", "---"):
                raise Exception(
                    f'Unexpected first line in markdown file: got "{section_header}", expected `%%%` or `---`'
                )
            header_lines = []
            for line in fh:
                if line.strip() == section_header:
                    break
                header_lines.append(line)
        header_data = "".join(header_lines)

        if section_header == "---":
            try:
                import yaml
            except ImportError as err:
                raise Exception(
                    "Unable to import python `yaml` library, needed for Kramdown processing"
                ) from err
            # Only the first YAML document of the header is used.
            metadata = next(yaml.safe_load_all(header_data))
        else:
            try:
                import toml

                parse_toml = toml.loads
            except ImportError as err:
                # Fall back to the standard-library TOML parser (Python
                # 3.11+) when the third-party `toml` package is missing.
                try:
                    import tomllib
                except ImportError:
                    raise Exception(
                        "Unable to import python `toml` library, needed for Mmark processing"
                    ) from err
                parse_toml = tomllib.loads
            metadata = parse_toml(header_data)
    except Exception:
        # Best-effort by design: every failure above (including the
        # explicit raises) ends up here and is reported as "no metadata".
        return {}

    # A header holding a bare scalar, a list, or nothing but comments does
    # not parse to a mapping; report it as "no metadata" as well.
    return metadata if isinstance(metadata, dict) else {}
def extract_xml(filename):
    """Collect metadata from an XML draft.

    The file is run through a SAX parser with an ``XmlHandler`` installed as
    its content handler; whatever the handler gathered in its ``metadata``
    dict is returned.  Parse errors propagate to the caller.
    """
    collector = XmlHandler()
    # xml.sax.parse builds a default parser, installs the handler and parses.
    xml.sax.parse(filename, collector)
    return collector.metadata
class XmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that gathers draft metadata into ``metadata``.

    Two kinds of input are recorded:

    * Inside the document's own ``<rfc><front>`` element, the whitespace-
      normalised text of each element named in ``interesting_elements``,
      stored under the element name.  A non-blank ``abbrev`` attribute on
      ``<title>`` is additionally stored under ``"abbrev"``.
    * Every processing instruction, stored as ``metadata[target] = data``.

    Text and attributes are captured per metadata element, so child markup
    inside such an element (for example ``<br/>`` inside ``<title>``) neither
    discards the element's attributes nor truncates its text.
    """

    # Element names whose text is recorded when found inside <rfc><front>.
    interesting_elements = ["title", "area", "workgroup"]
    # Runs of whitespace are collapsed to a single space in recorded text.
    wsp = re.compile(r"\s+")

    def __init__(self):
        super().__init__()
        self.metadata = {}
        # Names of the currently open elements, outermost first.
        self.stack = []
        # True only while inside the top-level <rfc><front> element.
        self.in_front = False
        # Attributes and accumulated text of the metadata element that is
        # currently being captured (empty while nothing is captured).
        self.attrs = {}
        self.content = ""
        # Depth in `stack` of the element being captured, or None.
        self._capture_depth = None

    def startElement(self, name, attrs):
        """Track the open element and begin capturing a metadata element."""
        self.stack.append(name)
        if self.stack == ["rfc", "front"]:
            self.in_front = True
        elif (
            self.in_front
            and self._capture_depth is None
            and name in self.interesting_elements
        ):
            self._capture_depth = len(self.stack)
            self.attrs = attrs
            self.content = ""

    def endElement(self, name):
        """Close the element and, if it was being captured, record it."""
        depth = len(self.stack)
        pop_name = self.stack.pop()
        # Internal invariant: the parser reports well-nested elements.
        assert name == pop_name
        if pop_name == "front" and self.stack == ["rfc"]:
            self.in_front = False
        if self._capture_depth == depth:
            abbrev = self.attrs.get("abbrev", "")
            if name == "title" and abbrev.strip() != "":
                self.metadata["abbrev"] = abbrev
            self.metadata[name] = self.wsp.sub(" ", self.content.strip())
            self._capture_depth = None
            self.attrs = {}
            self.content = ""

    def characters(self, data):
        """Accumulate text, including text of child elements, while capturing."""
        if self._capture_depth is not None:
            self.content += data

    def processingInstruction(self, target, data):
        """Record a processing instruction as ``metadata[target] = data``."""
        self.metadata[target.strip()] = data.strip()
# Maps a file extension to the function that extracts that format's metadata.
extract_funcs = {".md": extract_md, ".xml": extract_xml}


def main(argv):
    """Print one metadata field of a draft source file.

    ``argv[1]`` is the path of the draft and ``argv[2]`` is the name of the
    metadata field to print.  The extractor is chosen by file extension via
    ``extract_funcs``.  An empty line is printed when the path is not a
    file, the extension is not recognised, or the field is missing or null.
    The ``abbrev`` field falls back to ``title`` when no abbreviation is
    present.

    Args:
        argv: Full argument vector, program name first (as ``sys.argv``).
    """
    if len(argv) < 3:
        prog = argv[0] if argv else "extract-metadata.py"
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit(f"usage: {prog} <filename> <target>")
    filename, target = argv[1], argv[2]

    metadata = {}
    if os.path.isfile(filename):
        fileext = os.path.splitext(filename)[1]
        extract_func = extract_funcs.get(fileext)
        if extract_func is not None:
            metadata = extract_func(filename)

    value = metadata.get(target)
    if value is None and target == "abbrev":
        value = metadata.get("title")
    # A key that is present with a null value prints as empty, not "None".
    print("" if value is None else value)


if __name__ == "__main__":
    main(sys.argv)