-
Notifications
You must be signed in to change notification settings - Fork 0
/
export_documents_alto.py
128 lines (110 loc) · 4.29 KB
/
export_documents_alto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import json
import logging
import os
from typing import Any
from xml.etree import ElementTree
from tqdm import tqdm
from TkbsDocument import Document
from utilities import add_transkribus_args, find_existing, gather_document_folders, init_tkbs_connection, load_document, save_job_indication, setup_logging, setup_parser
from TranskribusPyClient import TranskribusClient
# Export-job parameters, kept as a JSON-formatted string and passed unchanged
# to tkbs.exportCollection() in main().
# With these flags the export includes document metadata, METS, images,
# PAGE XML, ALTO XML and a PDF; TEI, DOCX and the text/tag/table outputs are
# switched off.
# "splitIntoWordsInAltoXml" is set both under "commonPars" and "altoPars".
# NOTE(review): "pages" is "1" - presumably this restricts the export to the
# first page of each document; confirm against the Transkribus export API.
params="""
{ "commonPars" : {
"pages" : "1",
"doExportDocMetadata" : true,
"doWriteMets" : true,
"doWriteImages" : true,
"doExportPageXml" : true,
"doExportAltoXml" : true,
"doExportSingleTxtFiles" : false,
"doWritePdf" : true,
"doWriteTei" : false,
"doWriteDocx" : false,
"doWriteOneTxt" : false,
"doWriteTagsXlsx" : false,
"doWriteTagsIob" : false,
"doWriteTablesXlsx" : false,
"doWriteStructureInMets" : true,
"doCreateTitle" : false,
"useVersionStatus" : "Latest version",
"writeTextOnWordLevel" : false,
"doBlackening" : false,
"selectedTags" : [ "add", "date", "Address", "human_production", "supplied", "work", "unclear", "sic", "structure", "div", "highlight", "place1", "regionType", "speech", "person", "gap", "organization", "comment", "abbrev", "place", "add1", "Initial", "lat" ],
"font" : "FreeSerif",
"splitIntoWordsInAltoXml" : true,
"pageDirName" : "page",
"fileNamePattern" : "${filename}",
"useHttps" : true,
"remoteImgQuality" : "orig",
"doOverwrite" : true,
"useOcrMasterDir" : true,
"exportTranscriptMetadata" : true,
"updatePageXmlImageDimensions" : false
},
"altoPars" : {
"splitIntoWordsInAltoXml" : true
},
"pdfPars" : {
"doPdfImagesOnly" : false,
"doPdfImagesPlusText" : true,
"doPdfWithTextPages" : false,
"doPdfWithTags" : false,
"doPdfWithArticles" : true,
"doPdfA" : false,
"pdfImgQuality" : "view"
},
"docxPars" : {
"doDocxWithTags" : false,
"doDocxPreserveLineBreaks" : false,
"doDocxForcePageBreaks" : false,
"doDocxMarkUnclear" : false,
"doDocxKeepAbbrevs" : false,
"doDocxExpandAbbrevs" : false,
"doDocxSubstituteAbbrevs" : false,
"doDocxWriteFilenames" : false,
"doDocxIgnoreSuppliedTag" : false,
"doDocxShowSuppliedTagWithBrackets" : false
}
}
"""
def get_args():
    """Build the command-line parser and return the parsed arguments.

    The parser comes from ``setup_parser()`` and is extended by
    ``add_transkribus_args()`` before ``parse_args()`` is called on it.
    """
    arg_parser = setup_parser()
    add_transkribus_args(arg_parser)
    return arg_parser.parse_args()
def lines_in_doc(doc: dict):
    """Return True when ``doc['md']['nrOfLines']`` is greater than zero.

    Raises:
        KeyError: if ``doc`` has no ``'md'`` entry or that entry has no
            ``'nrOfLines'`` entry.
    """
    metadata = doc['md']
    line_count = metadata['nrOfLines']
    return line_count > 0
def main():
    """Issue a Transkribus export job for every local document folder.

    Parses the command line, configures logging, logs in to the Transkribus
    server given by ``args.tkbs_server`` and lists the documents of
    collection ``args.tkbs_collection_id``. Then, for each folder yielded by
    ``gather_document_folders(args.base)``:

    * the local document is loaded and matched against the remote list with
      ``find_existing``; folders without a match are counted as missing;
    * folders that already contain a ``transkribus_alto_output`` directory
      are counted as skipped unless ``args.overwrite`` is set;
    * otherwise ``exportCollection`` is called with the module-level
      ``params`` and the returned job id is stored through
      ``save_job_indication`` under ``job-status-export-alto.json``.

    A summary of issued / missing / skipped counts is printed at the end.
    """
    print('Running export to alto started')
    args = get_args()
    # Logging must be configured before the first logging call. A module-level
    # logging.debug() on an unconfigured root logger implicitly runs
    # basicConfig(), which installs a default WARNING-level handler: the debug
    # record is dropped and any later basicConfig()-based setup is ignored.
    setup_logging(args)
    logging.debug('Running export to alto started')

    tkbs = TranskribusClient(sServerUrl=args.tkbs_server)
    tkbs.auth_login(args.tkbs_user, args.tkbs_password, True)
    print(f'Running export to alto documents from Trankribus collection {args.tkbs_collection_id}')
    logging.info(f'Running export to alto on all documents from Trankribus collection {args.tkbs_collection_id}')
    existing_docs = tkbs.listDocsByCollectionId(args.tkbs_collection_id)

    jobs_issued = skipped = missing = 0
    # Materialise the folders so tqdm can show a total.
    folders = list(gather_document_folders(args.base))
    for folder in tqdm(folders):
        doc = load_document(folder)
        existing = find_existing(doc, existing_docs)
        if not existing:
            print(f"Can't locate document for {folder}, skipping")
            logging.warning(f"Can't locate document for {folder}, skipping")
            missing += 1
            continue

        tkbs_doc_id = int(existing['docId'])
        output_folder = os.path.join(folder, 'transkribus_alto_output')
        # An existing output folder means this document was exported before;
        # only redo it when the caller explicitly asked to overwrite.
        if os.path.exists(output_folder) and not args.overwrite:
            print(f'Skipping {doc.title}, it already has a transkribus output')
            logging.info(f'Skipping {doc.title}, it already has a transkribus output')
            skipped += 1
            continue

        logging.info(f'Starting export on document {doc.title}')
        job_id = tkbs.exportCollection(args.tkbs_collection_id, tkbs_doc_id, params)
        # Persist the job id next to the document.
        save_job_indication(folder, job_id, "job-status-export-alto.json")
        jobs_issued += 1

    print(f'Done, {jobs_issued} jobs issued, {missing} documents missing, {skipped} documents skipped')
if __name__ == '__main__':
main()