-
Notifications
You must be signed in to change notification settings - Fork 1
/
csv_from_alto.py
160 lines (134 loc) · 6.89 KB
/
csv_from_alto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
import xml.etree.ElementTree as ET
import csv
import sys
def get_mets_path(year, month, day) -> str:
    """Return the METS file name for one issue date.

    The three date parts are concatenated in the order given and followed by
    the fixed suffix ``-METS.xml``. The parts are formatted as they are; no
    zero-padding or validation is applied.
    """
    date_stamp = f"{year}{month}{day}"
    return date_stamp + "-METS.xml"
def build_mets(mets_root, namespace=None):
    """Extract one record per article from the LOGICAL structMap of a METS tree.

    For every direct child of ``mets_root`` that is a ``structMap`` with
    ``TYPE="LOGICAL"``, the div with ``ID="DIVL3"`` is located; each child of
    its ``TYPE="CONTENT"`` divs is treated as one article. The ``BEGIN``
    attributes of all ``area`` elements with ``BETYPE="IDREF"`` found below
    the article's ``BODY_CONTENT`` div are collected in document order.

    Args:
        mets_root: Root element of the parsed METS document.
        namespace: METS namespace in ElementTree ``{uri}`` form. When omitted,
            the module-level ``namespace_mets`` value is used.

    Returns:
        A list of dicts with the keys ``article_id``, ``begins``, ``title``,
        ``type``, ``page`` and ``text`` (``text`` is always the empty string
        here). Articles without a ``BODY_CONTENT`` div, or whose body holds no
        area reference, are reported on stdout and left out.
    """
    if namespace is None:
        namespace = namespace_mets

    body_query = f".//{namespace}div[@TYPE='BODY_CONTENT']"
    area_query = f".//{namespace}area[@BETYPE='IDREF']"

    mets_data = []
    for child in mets_root:
        if child.tag != f"{namespace}structMap" or child.get('TYPE') != 'LOGICAL':
            continue
        divl3 = child.find(f".//{namespace}div[@ID='DIVL3']")
        if divl3 is None:
            # A logical structMap without DIVL3 has nothing to extract.
            continue
        for div in divl3:
            if div.get('TYPE') != 'CONTENT':
                continue
            for d in div:
                article_id = d.get('ID')
                d_body_content = d.find(body_query)
                begins = []
                if d_body_content is not None:
                    for body_content in d_body_content:
                        for area in body_content.findall(area_query):
                            begin = area.get('BEGIN')
                            if begin is not None:
                                begins.append(begin)
                if not begins:
                    # Covers both a missing body and a body with no usable
                    # area reference; the latter used to fail on begins[0].
                    print(f"Article {article_id} does not have content and was skipped.")
                    continue
                first_begin = begins[0]
                # NOTE(review): only the second character of the first BEGIN
                # id is used as the page, so ids presumably look like
                # "P1_TB00001"; a multi-digit page number would be truncated
                # -- confirm against real data.
                page = first_begin[1] if len(first_begin) > 1 else ""
                mets_data.append({
                    "article_id": article_id,
                    "begins": begins,
                    "title": d.get('LABEL'),
                    "type": d.get('TYPE'),
                    "page": page,
                    "text": "",
                })
    return mets_data
def build_texts_blocks(alto_path, namespaces=None):
    """Collect the ``TextBlock`` elements of every ALTO file in a directory.

    Each regular file in ``alto_path`` is parsed as XML. For every configured
    namespace the first ``Layout`` element and, below it, the first
    ``PrintSpace`` element are looked up; the direct ``TextBlock`` children of
    that ``PrintSpace`` are appended to the result. Files are visited in
    sorted name order so the result is reproducible.

    Args:
        alto_path: Directory holding the ALTO XML files.
        namespaces: Iterable of ALTO namespaces in ElementTree ``{uri}`` form.
            When omitted, the module-level ``namespaces_alto`` list is used.

    Returns:
        A list of ``TextBlock`` elements.

    Raises:
        Exception: If no ``TextBlock`` was found in any file, which usually
            means the files use a namespace that is not configured.
    """
    if namespaces is None:
        namespaces = namespaces_alto

    texts_blocks = []
    for alto_file in sorted(os.listdir(alto_path)):
        alto_file_path = os.path.join(alto_path, alto_file)
        if not os.path.isfile(alto_file_path):
            # Sub-directories cannot be parsed; skip them instead of crashing.
            continue
        # Parse from the path so the XML parser reads the bytes itself and
        # applies the encoding declared in the document.
        alto_root = ET.parse(alto_file_path).getroot()
        for namespace_alto in namespaces:
            layout = alto_root.find(f".//{namespace_alto}Layout")
            if layout is None:
                continue
            print_space = layout.find(f".//{namespace_alto}PrintSpace")
            if print_space is None:
                # A Layout without PrintSpace contributes no blocks.
                continue
            # A bare tag path matches direct children only.
            texts_blocks.extend(print_space.findall(f"{namespace_alto}TextBlock"))

    if not texts_blocks:
        raise Exception("Error find TextBlocks. Verify that the namespaces is inserted to namespaces array.")
    return texts_blocks
def find_text_block(begin, texts_blocks):
    """Return the first block whose ``ID`` attribute equals ``begin``.

    Blocks are checked in the order given; ``None`` is returned when no block
    matches.
    """
    matching = (candidate for candidate in texts_blocks if candidate.get("ID") == begin)
    return next(matching, None)
def build_text(text_block, namespaces=None) -> str:
    """Flatten one ALTO ``TextBlock`` element into plain text.

    The children of ``text_block`` are treated as lines and their children as
    line content: ``String`` elements contribute their ``CONTENT`` attribute,
    ``SP`` elements contribute a single space, and every other element is
    ignored. A separating space is appended after each line unless the line's
    last element is ``HYP``; in that case the next line (or the next block the
    caller appends) continues the hyphenated word directly.

    Args:
        text_block: The ``TextBlock`` element to flatten.
        namespaces: Iterable of ALTO namespaces in ElementTree ``{uri}`` form.
            When omitted, the module-level ``namespaces_alto`` list is used.

    Returns:
        The block's text; an empty string for a block without lines.
    """
    if namespaces is None:
        namespaces = namespaces_alto

    # Resolve the qualified tag names once so that each element is classified
    # exactly once, whichever configured namespace it belongs to.
    string_tags = {f"{namespace_alto}String" for namespace_alto in namespaces}
    space_tags = {f"{namespace_alto}SP" for namespace_alto in namespaces}
    hyphen_tags = {f"{namespace_alto}HYP" for namespace_alto in namespaces}

    pieces = []
    for text_line in text_block:
        for element in text_line:
            if element.tag in string_tags:
                # A String without CONTENT contributes nothing instead of
                # failing on None.
                pieces.append(element.get("CONTENT") or "")
            elif element.tag in space_tags:
                pieces.append(" ")
        ends_with_hyphen = len(text_line) > 0 and text_line[-1].tag in hyphen_tags
        if not ends_with_hyphen:
            pieces.append(" ")
    return "".join(pieces)
def write_to_csv(data, csv_name):
    """Write article records to ``<csv_name>.csv``, leaving out the ``begins`` column.

    The header is taken from the keys of the first record. The file is written
    as UTF-8, and missing parent directories of the target are created first.
    The records passed in are not modified.

    Args:
        data: List of dicts, one per row. When it is empty, nothing is written
            and a notice is printed instead.
        csv_name: Target path without the ``.csv`` extension.

    Raises:
        ValueError: If a later record has a key (other than ``begins``) that
            is not present in the first record.
    """
    output_file_path = f'{csv_name}.csv'
    header_to_remove = "begins"

    if not data:
        # Without a first record there is no header to derive.
        print(f"No rows to write; {output_file_path} was not created.")
        return

    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    fieldnames = [key for key in data[0] if key != header_to_remove]
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Filter into new dicts so the caller's records keep their "begins".
        writer.writerows(
            {key: value for key, value in item.items() if key != header_to_remove}
            for item in data
        )
def break_code():
    """Print the missing-argument error and the usage hint, then exit with status 1."""
    messages = (
        "Error: Missing required argument.",
        "Usage: python run.py <folder_name>",
    )
    for message in messages:
        print(message)
    sys.exit(1)
def _iter_issue_dirs(root_path):
    """Yield ``(newspaper, year, month, day, day_path)`` for each day directory.

    The tree below ``root_path`` is walked four levels deep
    (newspaper / year / month / day); entries that are not directories are
    ignored at every level.
    """
    def subdirectories(parent):
        for name in os.listdir(parent):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                yield name, path

    for newspaper, newspaper_path in subdirectories(root_path):
        for year, year_path in subdirectories(newspaper_path):
            for month, month_path in subdirectories(year_path):
                for day, day_path in subdirectories(month_path):
                    yield newspaper, year, month, day, day_path


if __name__ == '__main__':
    # The root folder is a required command-line argument.
    if len(sys.argv) < 2:
        break_code()
    directory_path = sys.argv[1]

    # Module-level namespaces read by build_mets, build_texts_blocks and
    # build_text.
    namespace_mets = "{http://www.loc.gov/METS/}"
    namespaces_alto = [
        "{http://www.loc.gov/standards/alto/ns-v3#}",
        "{http://schema.ccs-gmbh.com/ALTO}"
    ]

    # The CSV files are written below "output/"; make sure it exists before
    # the first write.
    os.makedirs("output", exist_ok=True)

    for newspaper_folder, year, month, day, day_path in _iter_issue_dirs(directory_path):
        mets_path = os.path.join(day_path, get_mets_path(year, month, day))
        print(day_path)
        # Only parsing needs the file; close it before the heavier work.
        with open(mets_path, 'r', encoding='utf-8') as mets:
            mets_root = ET.parse(mets).getroot()

        mets_data = build_mets(mets_root)
        texts_blocks = build_texts_blocks(os.path.join(day_path, "ALTO"))

        for m in mets_data:
            # Concatenate the text of every referenced block that exists, in
            # reference order; unknown references are skipped.
            referenced = (find_text_block(begin, texts_blocks) for begin in m["begins"])
            m["text"] = "".join(
                build_text(text_block) for text_block in referenced if text_block is not None
            )

        csv_name = f"output/{newspaper_folder}-{year}-{month}-{day}"
        if mets_data:
            write_to_csv(mets_data, csv_name)
        else:
            # An issue without articles has no rows and no header to write.
            print(f"No articles found in {day_path}; no CSV written.")