-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_query_paper.py
25 lines (22 loc) · 1 KB
/
extract_query_paper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import xml.etree.ElementTree as ET
import os
def get_content_dict(directory):
    """Collect paragraph text from TEI XML files, grouped by section heading.

    Every file in ``directory`` whose name ends with ``.xml`` is parsed and
    walked in document order. Each TEI ``<head>`` element with text sets the
    current heading; each TEI ``<p>`` element that follows is stored under
    that heading. Headings with the same text, whether in one file or in
    several, share a single list.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A dict mapping each stripped heading text to a list of stripped
        paragraph strings. A heading with no paragraphs maps to an empty
        list. Paragraphs that appear before the first heading of a file are
        not collected, and paragraphs with no text at all are skipped.

    Raises:
        OSError: If ``directory`` or one of its files cannot be read.
        xml.etree.ElementTree.ParseError: If an ``.xml`` file is malformed.
    """
    tei_ns = '{http://www.tei-c.org/ns/1.0}'
    head_tag = tei_ns + 'head'
    para_tag = tei_ns + 'p'

    content_dict = {}
    # os.listdir() returns names in arbitrary order; sorting keeps the order
    # of paragraphs under a heading shared by several files reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(directory, filename)).getroot()

        # The heading is tracked per file so that paragraphs never attach to
        # a heading left over from the previous file.
        current_heading = None
        for elem in root.iter():
            if elem.tag == head_tag:
                # A <head> without leading text leaves the previous heading
                # in effect.
                if elem.text is not None:
                    current_heading = elem.text.strip()
                    content_dict.setdefault(current_heading, [])
            elif elem.tag == para_tag and current_heading:
                # elem.text stops at the first child element (for example an
                # inline <ref> citation); itertext() also yields the text
                # inside children and the text that follows them.
                text = ''.join(elem.itertext()).strip()
                if text:
                    content_dict[current_heading].append(text)
    return content_dict