-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathcpdfcomposition.ml
174 lines (162 loc) · 6.96 KB
/
cpdfcomposition.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
open Pdfutil
let find_composition_structure_info pdf marked =
match Pdf.lookup_obj pdf pdf.Pdf.root with
| Pdf.Dictionary d ->
begin match lookup "/StructTreeRoot" d with
| Some x ->
let l = ref [] in
let objs = Pdf.objects_referenced ["/Pg"] [] pdf x in
iter
(fun i ->
match Hashtbl.find marked i with
| () -> ()
| exception Not_found -> l := i::!l; Hashtbl.add marked i ())
objs;
!l
| _ -> []
end
| _ -> []
let find_composition_piece_info pdf marked =
let l = ref [] in
Pdf.objiter
(fun i obj ->
match Pdf.lookup_direct pdf "/PieceInfo" obj with
| Some x ->
let objs = Pdf.objects_referenced [] [] pdf x in
iter
(fun i ->
match Hashtbl.find marked i with
| () -> ()
| exception Not_found -> l := i::!l; Hashtbl.add marked i ())
objs
| None -> ())
pdf;
!l
let find_composition_images pdf i obj marked =
match Hashtbl.find marked i with () -> [] | exception Not_found ->
match Pdf.lookup_direct pdf "/Subtype" obj with
| Some (Pdf.Name "/Image") ->
Hashtbl.add marked i (); [i]
| _ -> []
(* If it has /Font, find all objects referenced from it, and add
any not already marked to the count *)
let find_composition_fonts pdf i obj marked =
match Hashtbl.find marked i with () -> [] | exception Not_found ->
let l = ref [] in
match Pdf.lookup_direct pdf "/Type" obj with
| Some (Pdf.Name "/Font") ->
iter
(fun i ->
(*Printf.printf "Object %i\n%s\n" i (Pdfwrite.string_of_pdf (Pdf.lookup_obj pdf i));*)
match Hashtbl.find marked i with
| () -> ()
| exception Not_found -> l := i::!l; Hashtbl.add marked i ())
(Pdf.objects_referenced [] [] pdf (Pdf.Indirect i));
!l
| _ -> []
let find_composition_content_streams pdf i obj marked =
match Hashtbl.find marked i with () -> [] | exception Not_found ->
match Pdf.lookup_direct pdf "/Type" obj with
| Some (Pdf.Name "/Page") ->
(*Printf.printf "Found a page...%s\n" (Pdfwrite.string_of_pdf (Pdf.direct pdf obj));*)
let cs =
match obj with Pdf.Dictionary d ->
begin match lookup "/Contents" d with
| Some (Pdf.Indirect i) -> [i]
| Some (Pdf.Array is) -> option_map (function Pdf.Indirect i -> Some i | _ -> None) is
| _ -> []
end
| _ -> []
in
(*Printf.printf "Found %i content streams\n" (length cs);*)
let l = ref [] in
iter
(fun i ->
(*Printf.printf "Considering content stream %i\n" i;*)
match Hashtbl.find marked i with
| () -> ()
| exception Not_found -> Hashtbl.add marked i (); l := i::!l)
cs;
!l
| _ ->
match Pdf.lookup_direct pdf "/Subtype" obj with
| Some (Pdf.Name "/Form") ->
Hashtbl.add marked i ();
[i]
| _ -> []
let find_composition pdf =
let marked = null_hash () in
let images = ref [] in
let fonts = ref [] in
let content_streams = ref [] in
Pdf.objiter
(fun i obj ->
(*Printf.printf "Looking at object %i\n" i;
Printf.printf "Which is %s\n" (Pdfwrite.string_of_pdf (Pdf.lookup_obj pdf i));
Printf.printf "Marked objects at beginning: ";
Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked;
Printf.printf "\n";*)
match Hashtbl.find marked i with _ -> () | exception Not_found ->
images := find_composition_images pdf i obj marked @ !images;
content_streams := find_composition_content_streams pdf i obj marked @ !content_streams;
fonts := find_composition_fonts pdf i obj marked @ !fonts)
pdf;
let structure_info = find_composition_structure_info pdf marked in
let piece_info = find_composition_piece_info pdf marked in
(!images, !fonts, !content_streams, structure_info, piece_info)
let size pdf i =
String.length (Pdfwrite.string_of_pdf_including_data (Pdf.lookup_obj pdf i))
let compressed_size pdf objnums =
if objnums = [] then 0 else
(* If there were object streams, assume objects were in them, and compressed with FlateDecode *)
if Hashtbl.length pdf.Pdf.objects.Pdf.object_stream_ids = 0 then
sum (map (size pdf) (setify objnums))
else
let b = Buffer.create 262144 in
let streams = ref 0 in
iter
(fun i ->
match Pdf.lookup_obj pdf i with
| Pdf.Stream _ -> streams += size pdf i
| obj -> Buffer.add_string b (Pdfwrite.string_of_pdf_including_data obj))
objnums;
!streams + Pdfio.bytes_size (Pdfcodec.encode_flate (Pdfio.bytes_of_string (Buffer.contents b)))
(* If no object streams, calculate the size of the xref table. If streams, the xref stream total *)
let compressed_xref_table_size pdf =
if Hashtbl.length pdf.Pdf.objects.Pdf.object_stream_ids = 0 then 20 * Pdf.objcard pdf else
compressed_size pdf (map fst (list_of_hashtbl pdf.Pdf.objects.Pdf.object_stream_ids))
let show_composition_json filesize pdf =
let perc x = float_of_int x /. float_of_int filesize *. 100. in
let o_images, o_fonts, o_content_streams, o_structure_info, o_piece_info = find_composition pdf in
let images, fonts, content_streams, structure_info, attached_files, piece_info, xref_table =
compressed_size pdf o_images,
compressed_size pdf o_fonts,
compressed_size pdf o_content_streams,
compressed_size pdf o_structure_info,
Cpdfattach.size_attached_files pdf,
compressed_size pdf o_piece_info,
compressed_xref_table_size pdf
in
let r = images + fonts + content_streams + structure_info + attached_files + xref_table + piece_info in
`List [`List [`String "Images"; `Int images; `Float (perc images)];
`List [`String "Fonts"; `Int fonts; `Float (perc fonts)];
`List [`String "Content streams"; `Int content_streams; `Float (perc content_streams)];
`List [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)];
`List [`String "Attached Files"; `Int attached_files; `Float (perc attached_files)];
`List [`String "XRef Table"; `Int xref_table; `Float (perc xref_table)];
`List [`String "Piece Info"; `Int piece_info; `Float (perc piece_info)];
`List [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]]
let show_composition_json_blob filesize pdf =
Pdfio.bytes_of_string (Cpdfyojson.Safe.pretty_to_string (show_composition_json filesize pdf))
let show_composition filesize json pdf =
let module J = Cpdfyojson.Safe in
let j = show_composition_json filesize pdf in
if json then (flprint (J.pretty_to_string j); flprint "\n") else
match j with
| `List js ->
iter
(function
| `List [`String a; `Int b; `Float c] -> Printf.printf "%s: %i bytes (%.2f%%)\n" a b c
| _ -> ())
js
| _ -> ()