-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathpdfst.ml
323 lines (311 loc) · 13.6 KB
/
pdfst.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
(** Operations on structure trees. *)
open Pdfutil
(* NB. This is very tightly integrated into pdfmerge.ml/pdfpage.ml with all
sorts of phase-order intricacies. Beware. *)
(* Future structure tree merging/trimming work:
o Stamping with -stamp-on, -stamp-under, -combine-pages needs to renumber
MCIDs on each page in some way.
o What should we do with IDTree, RoleMap, ClassMap, Namespaces? Simple
mechanisms work for all known examples, but known examples are few.
o Do we need to resurrect nulling of references to deleted annots when
trimming, for example, so they don't accidentally get included due to being
referenced from the parent tree? This is not a correctness issue, but a
space one. *)
(* Recursion between modules *)
let endpage = ref (fun _ -> 0)
(* Remove any structure tree node (and therefore its children) which has a page
number pointing to a page not to be included in the output. This should be a
reasonable first approximation to the required behaviour. Pdfpage.pdf_of_pages
immediately after making the copied PDF. *)
let trim_structure_tree pdf range =
let page_objnums_to_remove =
let objnums = Pdf.page_reference_numbers pdf in
map (fun x -> List.nth objnums (x - 1)) (setminus (ilist 1 (!endpage pdf)) range)
in
if page_objnums_to_remove = [] then () else
(* Calculate initial deletions - any object with /Pg not in range. *)
let del = ref [] in
Pdf.objiter
(fun n o ->
match Pdf.direct pdf o with
| Pdf.Dictionary d ->
begin match List.assoc_opt "/Pg" d with
| Some (Pdf.Indirect i) ->
if mem i page_objnums_to_remove then del := n::!del
| _ -> ()
end
| _ -> ())
pdf;
(* Any /K referencing these deleted objects is modifed to no longer reference it. *)
let replaceobjs = ref [] in
while !del <> [] || !replaceobjs <> [] do
(*Printf.printf "Top of loop. %i to remove, %i to replace\n" (length (setify_large !del)) (length (setify_large !replaceobjs));
iter (fun x -> Printf.printf "Removing %s\n" (Pdfwrite.string_of_pdf (Pdf.lookup_obj pdf x))) (setify_large !del);*)
iter (Pdf.removeobj pdf) (setify_large !del);
del := [];
(*iter
(fun (x, y) -> Printf.printf "Replacing %s with\n %s\n" x y)
(map (fun (n, r) -> (Pdfwrite.string_of_pdf (Pdf.lookup_obj pdf n), Pdfwrite.string_of_pdf r)) (setify_large !replaceobjs));*)
iter (Pdf.addobj_given_num pdf) (setify_large !replaceobjs);
replaceobjs := [];
Pdf.objiter
(fun n o ->
let process objs =
let survives = function
| Pdf.Indirect i ->
(* Must a) still exist in this file *)
Pdf.lookup_obj pdf i <> Pdf.Null &&
(* b) not be an object reference dictionary or marked content reference dictionary referencing a /Pg which is to be deleted *)
(* c) not be an object reference dictionary with a /Obj which is a deleted page *)
begin match Pdf.indirect_number pdf "/Pg" (Pdf.Indirect i), Pdf.indirect_number pdf "/Obj" (Pdf.Indirect i) with
| Some i, _ when mem i page_objnums_to_remove -> false
| _, Some i when mem i page_objnums_to_remove -> false
| _ -> true
end
| _ -> true
in
if List.for_all survives objs then None else Some (keep survives objs)
in
let process_indirect d is =
begin match process is with
| None -> () (* no change *)
| Some [] ->
(* empty not allowed - we must now delete this object *)
del := n::!del
| Some newlist ->
(* update the value of /K and change the object in place *)
begin match newlist with
| [e] -> replaceobjs =| (n, Pdf.replace_dict_entry (Pdf.Dictionary d) "/K" e)
| e::es -> replaceobjs =| (n, Pdf.replace_dict_entry (Pdf.Dictionary d) "/K" (Pdf.Array (e::es)))
| _ -> assert false
end
end
in
match Pdf.direct pdf o with
| Pdf.Dictionary d ->
begin match List.assoc_opt "/K" d with
| Some (Pdf.Integer _) -> ()
| Some (Pdf.Indirect i) -> process_indirect d [Pdf.Indirect i]
| Some (Pdf.Array objs) -> process_indirect d objs
| _ -> () (* /K can exist in transparency object, but is a boolean so ok. *)
end
| _ -> ())
pdf
done
(* Merge structure hierarchy / tagged PDF. Asterisked items will require
further work when we find good examples.
/IDTree name tree *merge
/ParentTree number tree renumber and merge
/ParentTreeNextKey integer remove
/RoleMap dict *merge
/ClassMap dict *merge
/Namespaces array merge
/PronunciationLexicon array concatenate
/AF array concatenate
/K structure tree merge trees *)
let print_parent_tree =
iter (fun (a, b) -> Printf.printf "%s -> %s\n" a (Pdfwrite.string_of_pdf b))
let renumber_parent_trees pdfs =
if length pdfs = 1 then () else
let parent_trees =
map
(fun pdf ->
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/ParentTree"] with
| Some t -> Pdftree.read_number_tree pdf t
| None -> [])
pdfs
in
(* iter2 (fun pt n -> Printf.printf "****************** PARENT TREE %i:\n" n; print_parent_tree pt) parent_trees (ilist 1 (length pdfs)); *)
(* Calculate a renumbering mapping from (pdf number, parent tree number) to 0,1,2.... *)
let num = ref 0 in
let rs = Hashtbl.create 256 in
iter2
(fun pt pdfn ->
iter (fun (k, _) -> Hashtbl.add rs (pdfn, int_of_string k) !num; num += 1) pt)
parent_trees
(ilist 1 (length pdfs));
(* Process all /StructParent(s) dictionary entries to point to new /ParentTree entries. *)
let replace_any_structparent n = function
| ("/StructParent" | "/StructParents") as k, Pdf.Integer i ->
begin match Hashtbl.find_opt rs (n, i) with
| Some i' -> (k, Pdf.Integer i')
| None -> (k, Pdf.Integer i)
end
| x -> x
in
let rec f n = function
| Pdf.Dictionary d ->
Pdf.recurse_dict (f n) (map (replace_any_structparent n) d)
| Pdf.Array a ->
Pdf.recurse_array (f n) a
| Pdf.Stream {contents = (Pdf.Dictionary d, s)} ->
Pdf.Stream {contents = (Pdf.recurse_dict (f n) (map (replace_any_structparent n) d), s)}
| x -> x
in
iter2
(fun pdf n ->
Pdf.objselfmap (f n) pdf)
pdfs (ilist 1 (length pdfs));
(* Write the new parent tree to each file *)
let renumbered_parent_trees =
map2
(fun pt pdfnum ->
map
(fun (k, v) -> match Hashtbl.find_opt rs (pdfnum, int_of_string k) with Some k' -> (string_of_int k', v) | None -> assert false)
pt)
parent_trees
(ilist 1 (length pdfs))
in
(* iter2 (fun pt n -> Printf.printf "****************** FINAL PARENT TREE %i:\n" n; print_parent_tree pt) renumbered_parent_trees (ilist 1 (length pdfs)); *)
iter2
(fun pdf renumbered ->
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/ParentTree"] with
| None -> ()
| Some t -> Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"; "/ParentTree"] (Pdftree.build_name_tree true pdf renumbered))
pdfs
renumbered_parent_trees
(*;
(* Write the PDFs to file to check them *)
iter2
(fun n pdf -> Pdfwrite.pdf_to_file pdf (string_of_int n ^ ".pdf"))
(ilist 1 (length pdfs))
pdfs*)
(* If add_toplevel_document is true, we add a PDF/UA-2 top-level /Document at the top of the structure tree. *)
let merge_structure_trees ?(add_toplevel_document=false) pdf pdfs =
let get_all struct_tree_roots pdf name =
option_map
(fun str -> Pdf.lookup_direct pdf name str) struct_tree_roots
in
let merge_dicts dicts =
fold_left
(fun d (k, v) -> Pdf.add_dict_entry d k v)
(Pdf.Dictionary [])
(flatten
(setify
(option_map
(function
| Pdf.Dictionary d -> Some d
| _ -> Pdfe.log "merge_dicts: not a dict"; None) dicts)))
in
let merge_arrays arrays =
Pdf.Array
(flatten
(setify
(option_map
(function
| Pdf.Array a -> Some a
| _ -> Pdfe.log "merge_array: not an array"; None) arrays)))
in
let mkarray = function
| Pdf.Array a -> Pdf.Array a
| x -> Pdf.Array [x]
in
let struct_tree_roots, struct_tree_objnums =
split
(option_map
(fun pdf ->
let catalog = Pdf.catalog_of_pdf pdf in
match Pdf.lookup_direct pdf "/StructTreeRoot" catalog with
| None -> None
| Some str ->
Some
(str,
match catalog with
| Pdf.Dictionary d ->
begin match lookup "/StructTreeRoot" d with Some (Pdf.Indirect i) -> i | _ -> 0 end
| _ -> raise (Pdf.PDFError "merge_structure_hierarchy: bad catalog")))
pdfs)
in
match struct_tree_roots with
| [] -> None
| [x] ->
Some (hd struct_tree_objnums) (* if only one, don't interfere, just preserve it. *)
| _ ->
let merged_idtree =
Pdftree.merge_name_trees_no_clash pdf (get_all struct_tree_roots pdf "/IDTree")
in
let merged_parenttree =
Pdftree.merge_number_trees_no_clash pdf (get_all struct_tree_roots pdf "/ParentTree")
in
let merged_rolemap =
merge_dicts (get_all struct_tree_roots pdf "/RoleMap") in
let merged_classmap =
merge_dicts (get_all struct_tree_roots pdf "/ClassMap") in
let merged_namespaces =
merge_arrays (get_all struct_tree_roots pdf "/Namespaces") in
let merged_pronunciation_lexicon =
merge_arrays (get_all struct_tree_roots pdf "/PronunciationLexicon") in
let merged_af =
merge_arrays (get_all struct_tree_roots pdf "/AF") in
let struct_tree_objnum = Pdf.addobj pdf Pdf.Null in
let merged_k =
(* 1. Get indirect references to each existing structure tree root object. They should be indirect, because /Ps will
need to point up to them, but may not be - so if not indirect, keep direct. *)
let existing_ks_of_struct_tree_roots_as_mostly_indirects =
flatten
(map
(fun root ->
match Pdf.lookup_immediate "/K" root with
| Some (Pdf.Indirect i) -> [Pdf.Indirect i]
| Some (Pdf.Array a) -> a
| Some x -> [x]
| None -> [])
struct_tree_roots)
in
(* 2. Rewrite in-place each previous indirect struct tree root /K member to have a /P pointing up to the new struct tree root. *)
mkarray
(Pdf.Array
(map
(function
| Pdf.Indirect i ->
let d = Pdf.lookup_obj pdf i in
Pdf.addobj_given_num pdf (i, Pdf.add_dict_entry d "/P" (Pdf.Indirect struct_tree_objnum));
Pdf.Indirect i
| Pdf.Dictionary _ as d ->
Pdf.add_dict_entry d "/P" (Pdf.Indirect struct_tree_objnum)
| x -> x)
existing_ks_of_struct_tree_roots_as_mostly_indirects))
in
let optional n = function
| Pdf.Dictionary [] -> []
| Pdf.Array [] -> []
| x -> [(n, Pdf.Indirect (Pdf.addobj pdf x))]
in
let standard_entries =
["/Type", Pdf.Name "/StructTreeRoot"]
@ optional "/IDTree" merged_idtree
@ optional "/ParentTree" merged_parenttree
@ optional "/RoleMap" merged_rolemap
@ optional "/ClassMap" merged_classmap
@ optional "/PronunciationLexicon" merged_pronunciation_lexicon
@ optional "/AF" merged_af
in
if add_toplevel_document then
begin
let merged_namespaces, pdf2_namespace_dictionary =
let namespace_objnum = Pdf.addobj pdf (Pdf.Dictionary [("/NS", Pdf.String "http://iso.org/pdf2/ssn")]) in
let merged_namespaces =
match merged_namespaces with
| Pdf.Array a -> Pdf.Array (Pdf.Indirect namespace_objnum::a)
| x -> x
in
(merged_namespaces, namespace_objnum)
in
let top_level_objnum =
Pdf.addobj pdf
(Pdf.Dictionary
[("/NS", Pdf.Indirect pdf2_namespace_dictionary);
("/S", Pdf.Name "/Document");
("/P", Pdf.Indirect struct_tree_objnum);
("/K", merged_k)])
in
let new_dict = Pdf.Dictionary (standard_entries @ optional "/Namespaces" merged_namespaces @ [("/K", Pdf.Indirect top_level_objnum)]) in
Pdf.addobj_given_num pdf (struct_tree_objnum, new_dict);
Some struct_tree_objnum
end
else
begin
let new_dict = Pdf.Dictionary (standard_entries @ optional "/Namespaces" merged_namespaces @ optional "/K" merged_k) in
Pdf.addobj_given_num pdf (struct_tree_objnum, new_dict);
Some struct_tree_objnum
end