-
Notifications
You must be signed in to change notification settings - Fork 0
/
annogen.py
executable file
·5530 lines (5331 loc) · 368 KB
/
annogen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# (compatible with both Python 2.7 and Python 3)
"Annotator Generator v3.391 (c) 2012-24 Silas S. Brown"
# See http://ssb22.user.srcf.net/adjuster/annogen.html
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# If you want to compare this code to old versions, the old
# versions are being kept in the E-GuideDog SVN repository on
# http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster
# and on GitHub at https://github.com/ssb22/adjuster
# and on GitLab at https://gitlab.com/ssb22/adjuster
# and on BitBucket https://bitbucket.org/ssb22/adjuster
# and at https://gitlab.developers.cam.ac.uk/ssb22/adjuster
# and in China: https://gitee.com/ssb22/adjuster
# although some early ones are missing.
import sys,os,os.path,tempfile,time,re,subprocess,unicodedata
import json,codecs
from optparse import OptionParser
# The two classes below are stand-ins for OptionParser: they expose only
# add_option(), and instead of registering the option they print its
# documentation.  They are defined unconditionally (rather than inside the
# branch that uses them) so that they can be exercised on their own.

class HTMLOptions:
    """Option "parser" for --html-options.

    Every add_option() call prints one <dt>/<dd> pair describing the
    option, so the add_option calls further down the file produce the
    body of an HTML definition list."""
    # (text to find in the help, HTML to substitute), applied in this order
    # after the help text has been escaped
    substitutions = (
        ("BEFORE","<strong>before</strong>"),
        ("AFTER","<strong>after</strong>"),
        ("ALWAYS","<strong>always</strong>"),
        (" ALL "," <strong>all</strong> "),
        (" LONG "," <strong>long</strong> "),
        (" NOT "," <strong>not</strong> "),
        ("WITHOUT","<strong>without</strong>"),
        ("js:search:replace,","js:<wbr>search:<wbr>replace,<wbr>"),
    )
    def add_option(self,*args,**kwargs):
        """Print the option names in args and kwargs["help"] as HTML.

        kwargs["default"], if present, is substituted for %default in
        the help text; all other keyword arguments except "action" are
        ignored."""
        # An option with no explicit action takes a value: show --opt as --opt=
        if 'action' not in kwargs: args=[a+'=' if a.startswith('--') else a for a in args]
        text = kwargs.get("help","").replace("%default",str(kwargs.get("default","%default")))
        # Escape HTML metacharacters before adding any markup of our own,
        # otherwise defaults such as <ruby><rb> are emitted as real tags
        text = text.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
        # Show any --option mentioned in the help as keyboard input
        text = re.sub('(--[A-Za-z-]*)',r'<kbd>\1</kbd>',text)
        # Allow a line break before / = _ when it sits between two letters
        text = re.sub('(?<=[A-Za-z])([/=_])(?=[A-Za-z])',r'<wbr/>\1',text)
        for find,repl in self.substitutions: text = text.replace(find,repl)
        print ("<dt><kbd>"+"</kbd>, <kbd>".join(args)+"</kbd></dt><dd>"+text+"</dd>")

class MarkdownOptions:
    """Option "parser" for --markdown-options.

    Every add_option() call prints the option as a Markdown
    definition-list entry (names in backticks, then " : " and the help)."""
    # (text to find in the help, Markdown to substitute), applied in this order
    substitutions = (
        ("BEFORE","**before**"),
        ("AFTER","**after**"),
        ("ALWAYS","**always**"),
        (" ALL "," **all** "),
        (" LONG "," **long** "),
        (" NOT "," **not** "),
        ("WITHOUT","**without**"),
    )
    def add_option(self,*args,**kwargs):
        """Print the option names in args and kwargs["help"] as Markdown.

        kwargs["default"], if present, is substituted for %default in
        the help text; all other keyword arguments except "action" are
        ignored."""
        # An option with no explicit action takes a value: show --opt as --opt=
        if 'action' not in kwargs: args=[a+'=' if a.startswith('--') else a for a in args]
        d = str(kwargs.get("default","%default"))
        # Defaults that contain a URL or markup are quoted as code
        if "://" in d or "<" in d: d="`"+d+"`"
        text = re.sub('(--[A-Za-z-]*)',r'`\1`',kwargs.get("help","").replace("%default",d))
        for find,repl in self.substitutions: text = text.replace(find,repl)
        print ("`"+"`, `".join(args)+"`\n : "+text+"\n")

# Choose what "parser" is; every later parser.add_option call goes to it.
if '--version' in sys.argv:
    print (__doc__+"\nLicensed under the Apache License, Version 2.0") ; sys.exit()
elif '--html-options' in sys.argv:
    # Heading is the program name and version: the docstring up to "(c)"
    print ("Options for "+__doc__[:__doc__.index("(c)")].strip()+":<dl>")
    parser = HTMLOptions()
    # OptionParser adds -h itself, so the stand-in must list it explicitly
    parser.add_option("-h","--help",action=True,help="show this help message and exit")
elif '--markdown-options' in sys.argv:
    l = "Options for "+__doc__[:__doc__.index("(c)")].strip()
    print (l) ; print ("="*len(l)) ; print ("") # heading underlined with = signs
    parser = MarkdownOptions()
    # OptionParser adds -h itself, so the stand-in must list it explicitly
    parser.add_option("-h","--help",action=True,help="show this help message and exit")
else: parser = OptionParser()
# getoutput lives in subprocess on Python 3 and in the commands module on
# Python 2.  Catch only ImportError so that anything else (for example a
# KeyboardInterrupt during start-up) is not silently turned into a second
# import attempt.
try: from subprocess import getoutput
except ImportError: from commands import getoutput
# Suffix for executable names (used in the default --c-compiler command):
# ".exe" when sys.platform mentions "win" or "mingw32" (Windows, Cygwin, etc),
# except that "darwin" also contains "win" and so must be excluded first.
exe = ".exe" if (not any(m in sys.platform for m in ("mac","darwin")) and any(w in sys.platform for w in ("win","mingw32"))) else ""
# =========== INPUT OPTIONS ==============
# Source of the training examples and its character encoding.
# --infile has no default, so standard input is used when it is not given
# (per its help text).
parser.add_option("--infile",
help="Filename of a text file (or a compressed .gz, .bz2 or .xz file or URL) to read the input examples from. If this is not specified, standard input is used.")
parser.add_option("--incode",default="utf-8",
help="Character encoding of the input file (default %default)")
# The three delimiters of one piece of annotation markup in the examples.
# Their values are stored under the names given by dest= (markupStart,
# markupMid, markupEnd), not under names derived from the option strings.
parser.add_option("--mstart",
dest="markupStart",
default="<ruby><rb>",
help="The string that starts a piece of text with annotation markup in the input examples; default %default")
parser.add_option("--mmid",
dest="markupMid",
default="</rb><rt>",
help="The string that occurs in the middle of a piece of markup in the input examples, with the word on its left and the added markup on its right (or the other way around if mreverse is set); default %default")
parser.add_option("--mend",
dest="markupEnd",
default="</rt></ruby>",
help="The string that ends a piece of annotation markup in the input examples; default %default")
# Boolean flag (stored as mreverse): swaps which side of mmid is the base text.
parser.add_option("-r","--mreverse",
action="store_true",default=False,
help="Specifies that the annotation markup is reversed, so the text BEFORE mmid is the annotation and the text AFTER it is the base text")
def cancelOpt(opt,act="store_false",dst=None):
    """Register a --no-OPT option that undoes an earlier --OPT.

    opt: name of the option to cancel, without the leading "--".
    act: optparse action for the cancelling option; the default
         "store_false" suits flags declared with action="store_true".
    dst: destination attribute to write; if not given (or empty) it is
         derived from opt by turning "-" into "_", which is the name
         optparse itself derives for --OPT.

    The option is added to the module-level parser."""
    if not dst: dst=opt.replace("-","_")
    parser.add_option("--no-"+opt,action=act,dest=dst,help="Cancels any earlier --"+opt+" option in Makefile variables etc")
# --no-mreverse: stores False back into mreverse
cancelOpt("mreverse")
# --end-pri takes a delimiter string; no default is given here.
# (Help text fix: "delimeter" -> "delimiter".)
parser.add_option("--end-pri",
help="Treat words that occur in the examples before this delimiter as having \"high priority\" for Yarowsky-like seed collocations (if these are in use). Normally the Yarowsky-like logic tries to identify a \"default\" annotation based on what is most common in the examples, with the exceptions indicated by collocations. If however a word is found in a high-priority section at the start, then the first annotation found there will be taken as the ideal \"default\" even if it's in a minority in the examples; everything else will be taken as an exception.")
# Note the inversion: -s/--spaces stores False into removeSpace (whose
# default is True), and the matching --no-spaces stores True back.
parser.add_option("-s", "--spaces",
action="store_false",
dest="removeSpace",
default=True,
help="Set this if you are working with a language that uses whitespace in its non-markedup version (not fully tested). The default is to assume that there will not be any whitespace in the language, which is correct for Chinese and Japanese.")
cancelOpt("spaces","store_true","removeSpace")
# Flags that switch OFF a normalisation the analyser otherwise attempts.
parser.add_option("-c", "--capitalisation",
action="store_true",
default=False,
help="Don't try to normalise capitalisation in the input. Normally, to simplify the rules, the analyser will try to remove start-of-sentence capitals in annotations, so that the only remaining words with capital letters are the ones that are ALWAYS capitalised such as names. (That's not perfect: some words might always be capitalised just because they never occur mid-sentence in the examples.) If this option is used, the analyser will instead try to \"learn\" how to predict the capitalisation of ALL words (including start of sentence words) from their contexts.") # TODO: make the C program put the sentence capitals back
cancelOpt("capitalisation")
parser.add_option("-w", "--annot-whitespace",
action="store_true",
default=False,
help="Don't try to normalise the use of whitespace and hyphenation in the example annotations. Normally the analyser will try to do this, to reduce the risk of missing possible rules due to minor typographical variations.") # TODO: can this be extended to the point where the words 'try to' can be deleted ? see comments
cancelOpt("annot-whitespace")
# Per-word exceptions to, and extra rules for, the normalisation above.
parser.add_option("--keep-whitespace",
help="Comma-separated list of words (without annotation markup) for which whitespace and hyphenation should always be kept even without the --annot-whitespace option. Use when you know the variation is legitimate. This option expects words to be encoded using the system locale (UTF-8 if it cannot be detected).")
parser.add_option("--suffix",
help="Comma-separated list of annotations that can be considered optional suffixes for normalisation") # e.g. use --suffix=r if you have Mandarin Pinyin with inconsistent -r additions
# NOTE(review): no type= is given, so optparse delivers a command-line value
# as a string while the default is the int 1 - presumably converted where it
# is used; verify.
parser.add_option("--suffix-minlen",
default=1,
help="Minimum length of word (in Unicode characters) to apply suffix normalisation")
parser.add_option("--post-normalise",
help="Filename of an optional Python module defining a dictionary called 'table' mapping integers to integers for arbitrary single-character normalisation on the Unicode BMP. This can reduce the size of the annotator. It is applied in post-processing (does not affect rules generation itself). For example this can be used to merge the recognition of Full, Simplified and Variant forms of the same Chinese character in cases where this can be done without ambiguity, if it is acceptable for the generated annotator to recognise mixed-script words should they occur. If any word in the examples has a different annotation when normalised than not, the normalised version takes precedence.")
# Gloss (popup definition) data.
parser.add_option("--glossfile",
help="Filename of an optional text file (or compressed .gz, .bz2 or .xz file or URL) to read auxiliary \"gloss\" information. Each line of this should be of the form: word (tab) annotation (tab) gloss. Extra tabs in the gloss will be converted to newlines (useful if you want to quote multiple dictionaries). When the compiled annotator generates ruby markup, it will add the gloss string as a popup title whenever that word is used with that annotation (before any reannotator option is applied). The annotation field may be left blank to indicate that the gloss will appear for all other annotations of that word. The entries in glossfile do NOT affect the annotation process itself, so it's not necessary to completely debug glossfile's word segmentation etc.")
# --gloss-closure takes a filename; --no-gloss-closure (cancelOpt's default
# store_false action) overwrites that value with False.
parser.add_option("-C", "--gloss-closure",
help="If any Chinese, Japanese or Korean word is missing from glossfile, search its closure of variant characters also, using the Unihan variants file specified by this option")
cancelOpt("gloss-closure")
parser.add_option("-M","--glossmiss-omit",
action="store_true",
default=False,
help="Omit rules containing any word not mentioned in glossfile. Might be useful if you want to train on a text that uses proprietary terms and don't want to accidentally 'leak' those terms (assuming they're not accidentally included in glossfile also). Words may also be listed in glossfile with an empty gloss field to indicate that no gloss is available but rules using this word needn't be omitted.")
cancelOpt("glossmiss-omit")
# Word list to leave out of the generated annotator.
# (Help text fix: removed the duplicated "if" in "if for example if you're".)
parser.add_option("--words-omit",
help="File (or compressed .gz, .bz2 or .xz file or URL) containing words (one per line, without markup) to omit from the annotator. Use this to make an annotator smaller if for example you're working from a rules file that contains long lists of place names you don't need this particular annotator to recognise but you still want to keep them as rules for other annotators, but be careful because any word on such a list gets omitted even if it also has other meanings (some place names are also normal words).")
# Hand-written rules that are added unconditionally.
parser.add_option("--manualrules",
help="Filename of an optional text file (or compressed .gz, .bz2 or .xz file or URL) to read extra, manually-written rules. Each line of this should be a marked-up phrase (in the input format) which is to be unconditionally added as a rule. Use this sparingly, because these rules are not taken into account when generating the others and they will be applied regardless of context (although a manual rule might fail to activate if the annotator is part-way through processing a different rule); try checking messages from --diagnose-manual.") # (or if there's a longer automatic match)
# =========== OUTPUT OPTIONS ==============
# Where the generated program goes and how it is compiled.
parser.add_option("--c-filename",default="",help="Where to write the C, C#, Python, Javascript, Go or Dart program. Defaults to standard output, or annotator.c in the system temporary directory if standard output seems to be the terminal (the program might be large, especially if Yarowsky-like indicators are not used, so it's best not to use a server home directory where you might have limited quota).") # because the main program might not be running on the launch node
# The default command ends in "annotator" plus exe (".exe" or "" as set above).
parser.add_option("--c-compiler",default="cc -o annotator"+exe,help="The C compiler to run if generating C and standard output is not connected to a pipe. The default is to use the \"cc\" command which usually redirects to your \"normal\" compiler. You can add options (remembering to enclose this whole parameter in quotes if it contains spaces), but if the C program is large then adding optimisation options may make the compile take a LONG time. If standard output is connected to a pipe, then this option is ignored because the C code will simply be written to the pipe. You can also set this option to an empty string to skip compilation. Default: %default")
parser.add_option("--outcode",default="utf-8",
help="Character encoding to use in the generated parser (default %default, must be ASCII-compatible i.e. not utf-16)")
# Two-stage operation: accumulate rules in a JSON file on one run
# (--write-rules) and generate the annotator from it on another (--read-rules).
parser.add_option("--rulesFile",help="Filename of a JSON file to hold the accumulated rules. Adding .gz, .bz2 or .xz for compression is acceptable. If this is set then either --write-rules or --read-rules must be specified.")
parser.add_option("--write-rules",
action="store_true",default=False,
help="Write rulesFile instead of generating a parser. You will then need to rerun with --read-rules later.")
cancelOpt("write-rules")
parser.add_option("--read-rules",
action="store_true",default=False,
help="Read rulesFile from a previous run, and apply the output options to it. You should still specify the input formatting options (which should not change), and any glossfile or manualrules options (which may change), but no input is required.")
cancelOpt("read-rules")
# Note the inversion: -E/--newlines-reset stores False into ignoreNewlines
# (whose default is True), and --no-newlines-reset stores True back.
parser.add_option("-E","--newlines-reset",
action="store_false",
dest="ignoreNewlines",
default=True,
help="Have the annotator reset its state on every newline byte. By default newlines do not affect state such as whether a space is required before the next word, so that if the annotator is used with Web Adjuster's htmlText option (which defaults to using newline separators) the spacing should be handled sensibly when there is HTML markup in mid-sentence.")
cancelOpt("newlines-reset","store_true","ignoreNewlines")
# Two independent compression schemes for the embedded data.
parser.add_option("-z","--compress",
action="store_true",default=False,
help="Compress annotation strings in the C code. This compression is designed for fast on-the-fly decoding, so it saves only a limited amount of space (typically 10-20%) but might help if RAM is short.")
cancelOpt("compress")
parser.add_option("-Z","--zlib",
action="store_true",default=False,
help="Compress the embedded data table using zlib (or pyzopfli if available), and include code to call zlib to decompress it on load. Useful if the runtime machine has the zlib library and you need to save disk space but not RAM (the decompressed table is stored separately in RAM, unlike --compress which, although giving less compression, at least works 'in place'). Once --zlib is in use, specifying --compress too will typically give an additional disk space saving of less than 1% (and a runtime RAM saving that's greater but more than offset by zlib's extraction RAM). If generating a Javascript annotator with zlib, the decompression code is inlined so there's no runtime zlib dependency, but startup can be ~50% slower so this option is not recommended in situations where the annotator is frequently reloaded from source (unless you're running on Node.js in which case loading is faster due to the use of Node's \"Buffer\" class).")
cancelOpt("zlib")
# Alternative C front ends in place of the default stdin/stdout wrapper.
parser.add_option("-l","--library",
action="store_true",default=False,
help="Instead of generating C code that reads and writes standard input/output, generate a C library suitable for loading into Python via ctypes. This can be used for example to preload a filter into Web Adjuster to cut process-startup delays.")
cancelOpt("library")
parser.add_option("-W","--windows-clipboard",
action="store_true",default=False,
help="Include C code to read the clipboard on Windows or Windows Mobile and to write an annotated HTML file and launch a browser, instead of using the default cross-platform command-line C wrapper. See the start of the generated C file for instructions on how to compile for Windows or Windows Mobile.")
cancelOpt("windows-clipboard")
# Java output; the Android options below build on it (--android's help
# says --java must be set).
parser.add_option("--java",
help="Instead of generating C code, generate Java, and place the *.java files in the directory specified by this option. The last part of the directory should be made up of the package name; a double slash (//) should separate the rest of the path from the package name, e.g. --java=/path/to/wherever//org/example/annotator and the main class will be called Annotator.")
parser.add_option("--android",
help="URL for an Android app to browse (--java must be set). If this is set, code is generated for an Android app which starts a browser with that URL as the start page, and annotates the text on every page it loads. Use file:///android_asset/index.html for local HTML files in the assets directory; a clipboard viewer is placed in clipboard.html, and the app will also be able to handle shared text. If certain environment variables are set, this option can also compile and sign the app using Android SDK command-line tools (otherwise it puts a message on stderr explaining what needs to be set)")
parser.add_option("--android-template",
help="File to use as a template for Android start HTML. This option implies --android=file:///android_asset/index.html and generates that index.html from the file specified (or from a built-in default if the special filename 'blank' is used). The template file may include URL_BOX_GOES_HERE to show a URL entry box and related items (offline-clipboard link etc) in the page, in which case you can optionally define a Javascript function 'annotUrlTrans' to pre-convert some URLs from shortcuts etc; also enables better zoom controls on Android 4+, a mode selector if you use --annotation-names, a selection scope control on recent-enough WebKit, and a visible version stamp (which, if the device is in 'developer mode', you may double-tap on to show missing glosses). VERSION_GOES_HERE may also be included if you want to put it somewhere other than at the bottom of the page. If you do include URL_BOX_GOES_HERE you'll have an annotating Web browser app that allows the user to navigate to arbitrary URLs: as of 2020, this is acceptable on Google Play and Huawei AppGallery (non-China only from 2022), but NOT Amazon AppStore as they don't want 'competition' to their Silk browser.") # but some devices allow APKs to be 'side-loaded'. annotUrlTrans returns undefined = uses original
# The default is a regular expression with four alternatives, matching the
# four removals described in the help text.
parser.add_option("--gloss-simplify",default="^to |^[(][^)]*[)] | [(][^)]*[)]|[;/].*",
help="A regular expression matching parts of glosses to remove when generating a '3-line' format in apps, but not for hover titles or popups. Default removes parenthesised expressions if not solitary, anything after the first slash or semicolon, and the leading word 'to'. Can be set to empty string to omit simplification.")
parser.add_option("-L","--pleco-hanping",
action="store_true",default=False,
help="In the Android app, make popup definitions link to Pleco or Hanping if installed")
cancelOpt("pleco-hanping")
parser.add_option("--bookmarks",
help="Android bookmarks: comma-separated list of package names that share our bookmarks. If this is not specified, the browser will not be given a bookmarks function. If it is set to the same value as the package specified in --java, bookmarks are kept in just this Android app. If it is set to a comma-separated list of packages that have also been generated by annogen (presumably with different annotation types), and if each one has the same android:sharedUserId attribute in AndroidManifest.xml's 'manifest' tag (you'll need to add this manually), and if the same certificate is used to sign all of them, then bookmarks can be shared across the set of browser apps. But beware the following two issues: (1) adding an android:sharedUserId attribute to an app that has already been released without one causes some devices to refuse the update with a 'cannot install' message (details via adb logcat; affected users would need to uninstall and reinstall instead of update, and some of them may not notice the instruction to do so); (2) this has not been tested with Google's new \"App Bundle\" arrangement, and may be broken if the Bundle results in APKs being signed by a different key. In June 2019 Play Console started issuing warnings if you release an APK instead of a Bundle, even though the \"size savings\" they mention are under 1% for annogen-generated apps.") # (the only resource that might vary by device is the launcher icon)
# Optional extra features of the generated Android browser.
parser.add_option("-e","--epub",
action="store_true",default=False,
help="When generating an Android browser, make it also respond to requests to open EPUB files. This results in an app that requests the 'read external storage' permission on Android versions below 6, so if you have already released a version without EPUB support then devices running Android 5.x or below will not auto-update past this change until the user notices the update notification and approves the extra permission.") # see comments around READ_EXTERNAL_STORAGE below
cancelOpt("epub")
parser.add_option("--android-print",
action="store_true",default=False,
help="When generating an Android browser, include code to provide a Print option (usually print to PDF) and a simple highlight-selection option. The Print option will require Android 4.4, but the app should still run without it on earlier versions of Android.")
cancelOpt("android-print")
# --freq-count writes a file in the format that --known-characters reads
# (each help text refers to the other option).
parser.add_option("--known-characters",help="When generating an Android browser, include an option to leave the most frequent characters unannotated as 'known'. This option should be set to the filename of a UTF-8 file of characters separated by newlines, assumed to be most frequent first, with characters on the same line being variants of each other (see --freq-count for one way to generate it). Words consisting entirely of characters found in the first N lines of this file (where N is settable by the user) will be unannotated until tapped on.")
parser.add_option("--freq-count",help="Name of a file to write that is suitable for the known-characters option, taken from the input examples (which should be representative of typical use). Any post-normalise table provided will be used to determine which characters are equivalent.")
parser.add_option("--android-audio",help="When generating an Android browser, include an option to convert the selection to audio using this URL as a prefix, e.g. https://example.org/speak.cgi?text= (use for languages not likely to be supported by the device itself). Optionally follow the URL with a space (quote carefully) and a maximum number of words to read in each user request. Setting a limit is recommended, or somebody somewhere will likely try 'Select All' on a whole book or something and create load problems. You should set a limit server-side too of course.") # do need https if we're Android 5+ and will be viewing HTTPS pages, or Chrome will block (OK if using EPUB-etc or http-only pages)
# Site-specific fix-ups injected into pages by the Android app / extension.
# The "--opt=@file" usage examples in these help texts had been replaced by a
# web e-mail-obfuscation placeholder ("[email protected]"); they are restored
# here.  NOTE(review): the file names in the examples are reconstructed
# (the HTML help formatter's handling of "js:search:replace," confirms the
# last one) - confirm the wording against the upstream help text.
parser.add_option("--extra-js",help="Extra Javascript to inject into sites to fix things in the Android browser app. The snippet will be run before each scan for new text to annotate. You may also specify a file to read: --extra-js=@file.js or --extra-js=@file1.js,file2.js (do not use // comments in these files, only /* ... */ because newlines will be replaced), and you can create variants of the files by adding search-replace strings: --extra-js=@file1.js:search:replace,file2.js")
parser.add_option("--tts-js",action="store_true",default=False,help="Make Android 5+ multilingual Text-To-Speech functions available to extra-js scripts (see TTSInfo code for details)")
cancelOpt("tts-js")
parser.add_option("--existing-ruby-js-fixes",help="Extra Javascript to run in the Android browser app or browser extension whenever existing RUBY elements are encountered; the DOM node above these elements will be in the variable n, which your code can manipulate or replace to fix known problems with sites' existing ruby (such as common two-syllable words being split when they shouldn't be). Use with caution. You may also specify a file to read: --existing-ruby-js-fixes=@file.js")
parser.add_option("--existing-ruby-lang-regex",help="Set the Android app or browser extension to remove existing ruby elements unless the document language matches this regular expression. If --sharp-multi is in use, you can separate multiple regexes with comma and any unset will always delete existing ruby. If this option is not set at all then existing ruby is always kept.")
parser.add_option("--existing-ruby-shortcut-yarowsky",action="store_true",default=False,help="Set the Android browser app to 'shortcut' Yarowsky-like collocation decisions when adding glosses to existing ruby over 2 or more characters, so that words normally requiring context to be found are more likely to be found without context (this may be needed because adding glosses to existing ruby is done without regard to context)") # (an alternative approach would be to collapse the existing ruby markup to provide the context, but that could require modifying the inner functions to 'see' context outside the part they're annotating)
parser.add_option("--extra-css",help="Extra CSS to inject into sites to fix things in the Android browser app. You may also specify a file to read --extra-css=@file.css")
parser.add_option("--app-name",default="Annotating browser",
help="User-visible name of the Android app")
# Skip code generation and only run the compiler on previously generated code.
parser.add_option("--compile-only",
action="store_true",default=False,
help="Assume the code has already been generated by a previous run, and just run the compiler")
cancelOpt("compile-only")
# JavaScript output and options affecting how its data string is encoded.
parser.add_option("-j","--javascript",
action="store_true",default=False,
help="Instead of generating C code, generate JavaScript. This might be useful if you want to run an annotator on a device that has a JS interpreter but doesn't let you run your own binaries. The JS will be table-driven to make it load faster. See comments at the start for usage.") # but it's better to use the C version if you're in an environment where 'standard input' makes sense
cancelOpt("javascript")
parser.add_option("-6","--js-6bit",
action="store_true",default=False,
help="When generating a Javascript annotator, use a 6-bit format for many addresses to reduce escape codes in the data string by making more of it ASCII") # May result in marginally slower JS, but it should be smaller and parse more quickly on initial load, which is normally the dominant factor if you have to reload it on every page.
cancelOpt("js-6bit")
parser.add_option("-8","--js-octal",
action="store_true",default=False,
help="When generating a Javascript annotator, use octal instead of hexadecimal codes in the data string when doing so would save space. This does not comply with ECMAScript 5 and may give errors in its strict mode.")
cancelOpt("js-octal")
parser.add_option("-9","--ignore-ie8",
action="store_true",default=False,
help="When generating a Javascript annotator, do not make it backward-compatible with Microsoft Internet Explorer 8 and below. This may save a few bytes.")
cancelOpt("ignore-ie8")
parser.add_option("-u","--js-utf8",
action="store_true",default=False,
help="When generating a Javascript annotator, assume the script can use UTF-8 encoding directly and not via escape sequences. In some browsers this might work only on UTF-8 websites, and/or if your annotation can be expressed without the use of Unicode combining characters.")
cancelOpt("js-utf8")
# Browser-extension output.  Unlike most flags here, --manifest-v3 has no
# --no- counterpart registered.
parser.add_option("--browser-extension", help="Name of a Chrome or Firefox browser extension to generate. The extension will be placed in a directory of the same name (without spaces), which may optionally already exist and contain icons like 32.png and 48.png to be used.")
# To test the resulting extension locally:
# Firefox: about:debugging - 'this firefox' - load temporary add-on - manifest.json
# Chrome: chrome://extensions - Developer mode - Load unpacked - select the directory
# Chrome bug: browser_style true gives unreadable text in Chromium 89 with enable-force-dark set to "Enabled with selective inversion of everything" (and possibly other settings)
parser.add_option("--browser-extension-description", help="Description field to use when generating browser extensions")
parser.add_option("--manifest-v3",
action="store_true",default=False,
help="Use Manifest v3 instead of Manifest v2 when generating browser extensions (tested on Chrome only, and requires Chrome 88 or higher). This is now required for all Chrome Web Store uploads.")
parser.add_option("--gecko-id",help="a Gecko (Firefox) ID to embed in the browser extension")
# Dart and Python output.
parser.add_option("--dart",
action="store_true",default=False,
help="Instead of generating C code, generate Dart. This might be useful if you want to run an annotator in a Flutter application.")
cancelOpt("dart")
parser.add_option("--dart-datafile",
help="When generating Dart code, put annotator data into a separate file and open it using this pathname. Not compatible with Dart's \"Web app\" option, but might save space in a Flutter app (especially along with --zlib)")
parser.add_option("-Y","--python",
action="store_true",default=False,
help="Instead of generating C code, generate a Python module. Similar to the Javascript option, this is for when you can't run your own binaries, and it is table-driven for fast loading.")
cancelOpt("python")
# Shell command used to re-derive each word's annotation; a leading # or ##
# on the command selects what it is fed (see the help text).
parser.add_option("--reannotator",
help="Shell command through which to pipe each word of the original text to obtain new annotation for that word. This might be useful as a quick way of generating a new annotator (e.g. for a different topolect) while keeping the information about word separation and/or glosses from the previous annotator, but it is limited to commands that don't need to look beyond the boundaries of each word. If the command is prefixed by a # character, it will be given the word's existing annotation instead of its original text, and if prefixed by ## it will be given text#annotation. The command should treat each line of its input independently, and both its input and its output should be in the encoding specified by --outcode.") # TODO: reannotatorCode instead? (see other 'reannotatorCode' TODOs)
# (Could just get the reannotator to post-process the 1st annotator's output, but that might be slower than generating an altered annotator with it)
parser.add_option("-A","--reannotate-caps",
action="store_true",default=False,
help="When using --reannotator, make sure to capitalise any word it returns that began with a capital on input")
cancelOpt("reannotate-caps")
parser.add_option("--sharp-multi",
action="store_true",default=False,
help="Assume annotation (or reannotator output) contains multiple alternatives separated by # (e.g. pinyin#Yale) and include code to select one by number at runtime (starting from 0). This is to save on total space when shipping multiple annotators that share the same word grouping and gloss data, differing only in the transcription of each word.")
cancelOpt("sharp-multi")
# Companion options for --sharp-multi: display names for the alternatives,
# number overrides, and extra post-processing code.  All three take a value
# and have no default, so they are None when not given on the command line.
parser.add_option("--annotation-names",help="Comma-separated list of annotation types supplied to sharp-multi (e.g. Pinyin,Yale), if you want the Android app etc to be able to name them. You can also set just one annotation name here if you are not using sharp-multi.")
parser.add_option("--annotation-map",help="Comma-separated list of annotation-number overrides for sharp-multi, e.g. 7=3 to take the 3rd item if a 7th is selected") # this one starts at 1 rather than 0
parser.add_option("--annotation-postprocess",help="Extra code for post-processing specific annotNo selections after retrieving from a sharp-multi list (@file is allowed)")
# =========== ANALYSIS OPTIONS ==============
# Overlap handling: -o relaxes the no-overlap rule, -y (below) is the alternative it refers to
parser.add_option("-o", "--allow-overlaps",
action="store_true",default=False,
help="Normally, the analyser avoids generating rules that could overlap with each other in a way that would leave the program not knowing which one to apply. If a short rule would cause overlaps, the analyser will prefer to generate a longer rule that uses more context, and if even the entire phrase cannot be made into a rule without causing overlaps then the analyser will give up on trying to cover that phrase. This option allows the analyser to generate rules that could overlap, as long as none of the overlaps would cause actual problems in the example phrases. Thus more of the examples can be covered, at the expense of a higher risk of ambiguity problems when applying the rules to other texts. See also the -y option.")
cancelOpt("allow-overlaps")
# Yarowsky seed-collocation search range.  These numeric options have int defaults
# but no type= argument, so values given on the command line arrive as strings.
parser.add_option("-y","--ybytes",default=0,
help="Look for candidate Yarowsky seed-collocations within this number of bytes of the end of a word. If this is set then overlaps and rule conflicts will be allowed when seed collocations can be used to distinguish between them, and the analysis is likely to be faster. Markup examples that are completely separate (e.g. sentences from different sources) must have at least this number of (non-whitespace) bytes between them.")
parser.add_option("--ybytes-max",default=0,
help="Extend the Yarowsky seed-collocation search to check over larger ranges up to this maximum. If this is set then several ranges will be checked in an attempt to determine the best one for each word, but see also ymax-threshold and ymax-limitwords.")
parser.add_option("--ymax-threshold",default=1,
help="Limits the length of word that receives the narrower-range Yarowsky search when ybytes-max is in use. For words longer than this, the search will go directly to ybytes-max. This is for languages where the likelihood of a word's annotation being influenced by its immediate neighbours more than its distant collocations increases for shorter words, and less is to be gained by comparing different ranges when processing longer words. Setting this to 0 means no limit, i.e. the full range will be explored on ALL Yarowsky checks.") # TODO: see TODO below re temporary recommendation of --ymax-threshold=0
parser.add_option("--ymax-limitwords",
help="Comma-separated list of words (without annotation markup) for which the ybytes expansion loop should run at most two iterations. This may be useful to reduce compile times for very common ambiguous words that depend only on their immediate neighbours. Annogen may suggest words for this option if it finds they take inordinate time to process.") # two iterations rather than one increases the rate of correctly handling things like 'yi/bu sandhi before duoyinzi' in Chinese, where the next TWO characters matter because the sandhi tone depends on how the duoyinzi resolves (which is often determined by the 3rd character, although this shortcut may not catch some rare cases where it's determined by one further on)
parser.add_option("--ybytes-step",default=3,
help="The increment value for the loop between ybytes and ybytes-max")
# Yarowsky behaviour flags (each store_true flag is followed by its cancelOpt call)
parser.add_option("-k","--warn-yarowsky",
action="store_true",default=False,
help="Warn when absolutely no distinguishing Yarowsky seed collocations can be found for a word in the examples")
cancelOpt("warn-yarowsky")
parser.add_option("-K","--yarowsky-all",
action="store_true",default=False,
help="Accept Yarowsky seed collocations even from input characters that never occur in annotated words (this might include punctuation and example-separation markup)")
cancelOpt("yarowsky-all")
parser.add_option("--yarowsky-multiword",
action="store_true",default=False,
help="Check potential multiword rules for Yarowsky seed collocations also. Without this option (default), only single-word rules are checked.") # multiword might not work so well
cancelOpt("yarowsky-multiword")
parser.add_option("--yarowsky-thorough",
action="store_true",default=False,
help="Recheck Yarowsky seed collocations when checking if any multiword rule would be needed to reproduce the examples. This could risk 'overfitting' the example set.") # (more likely to come up with rules that aren't really needed and end with 1st half of a sandhi etc)
cancelOpt("yarowsky-thorough")
parser.add_option("--yarowsky-half-thorough",
action="store_true",default=False,
help="Like --yarowsky-thorough but check only what collocations occur within the proposed new rule (not around it), less likely to overfit")
cancelOpt("yarowsky-half-thorough")
# %default in the help text below is substituted by optparse with the option's default value
parser.add_option("--yarowsky-debug",default=1,
help="Report the details of seed-collocation false positives if there are a large number of matches and at most this number of false positives (default %default). Occasionally these might be due to typos in the corpus, so it might be worth a check.")
# Normalisation diagnostics and cache, followed by the single-word rule limit.
# --normalise-debug has an int default but no type=, so a command-line value
# arrives as a string (it is converted after parsing).
parser.add_option("--normalise-debug",default=1,
help="When --capitalisation is not in effect, report words that are usually capitalised but that have at most this number of lower-case exceptions (default %default) for investigation of possible typos in the corpus")
parser.add_option("--normalise-cache",
help="Optional file to use to cache the result of normalisation. Adding .gz, .bz2 or .xz for compression is acceptable.")
parser.add_option("-1","--single-words",
action="store_true",default=False,
help="Do not generate any rule longer than 1 word, although it can still have Yarowsky seed collocations if -y is set. This speeds up the search, but at the expense of thoroughness. You might want to use this in conjunction with -y to make a parser quickly.")
cancelOpt("single-words")
# Rule-length limits
parser.add_option("--max-words",default=0,
help="Limits the number of words in a rule. 0 means no limit. --single-words is equivalent to --max-words=1. If you need to limit the search time, and are using -y, it should suffice to use --single-words for a quick annotator or --max-words=5 for a more thorough one (or try 3 if --yarowsky-half-thorough is in use).") # (There was a bug in annogen versions before 0.58 that caused --max-words to additionally limit how far away from the start of its phrase a rule-example must be placed; this has now been fixed. There was also a bug that resulted in too many extra rules being tested over already-catered-for phrases; as this has now been fixed, the additional benefit of a --max-words limit is now reduced, but you might want to put one in anyway. That second bug also had the effect of the coverage % being far too low in the progress stats.)
parser.add_option("--multiword-end-avoid",
help="Comma-separated list of words (without annotation markup) that should be avoided at the end of a multiword rule (e.g. sandhi likely to depend on the following word)")
# Diagnostics for investigating why a word did or did not get a rule
parser.add_option("-d","--diagnose",help="Output some diagnostics for the specified word. Use this option to help answer \"why doesn't it have a rule for...?\" issues. This option expects the word without markup and uses the system locale (UTF-8 if it cannot be detected).")
parser.add_option("--diagnose-limit",default=10,help="Maximum number of phrases to print diagnostics for (0 means unlimited). Default: %default")
parser.add_option("-m","--diagnose-manual",
action="store_true",default=False,
help="Check and diagnose potential failures of --manualrules")
cancelOpt("diagnose-manual")
parser.add_option("-q","--diagnose-quick",
action="store_true",default=False,
help="Ignore all phrases that do not contain the word specified by the --diagnose option, for getting a faster (but possibly less accurate) diagnostic. The generated annotator is not likely to be useful when this option is present.")
cancelOpt("diagnose-quick")
# Alternative output: a word-priority list instead of an annotator
parser.add_option("--priority-list",help="Instead of generating an annotator, use the input examples to generate a list of (non-annotated) words with priority numbers, a higher number meaning the word should have greater preferential treatment in ambiguities, and write it to this file (or compressed .gz, .bz2 or .xz file). If the file provided already exists, it will be updated, thus you can amend an existing usage-frequency list or similar (although the final numbers are priorities and might no longer match usage-frequency exactly). The purpose of this option is to help if you have an existing word-priority-based text segmenter and wish to update its data from the examples; this approach might not be as good as the Yarowsky-like one (especially when the same word has multiple readings to choose from), but when there are integration issues with existing code you might at least be able to improve its word-priority data.")
# Run control and status-line display
parser.add_option("-t","--time-estimate",
action="store_true",default=False,
help="Estimate time to completion. The code to do this is unreliable and is prone to underestimate. If you turn it on, its estimate is displayed at the end of the status line as days, hours or minutes.") # Unreliable because the estimate assumes 'phrases per minute' will remain constant on average, whereas actually it will decrease because the more complex phrases are processed last
cancelOpt("time-estimate")
parser.add_option("-0","--single-core",
action="store_true",default=False,
help="Use only one CPU core even when others are available on Unix")
cancelOpt("single-core")
parser.add_option("--cores-command",help="Command to run when changing the number of CPU cores in use (with new number as a parameter); this can run a script to pause/resume any lower-priority load")
parser.add_option("-p","--status-prefix",help="Label to add at the start of the status line, for use if you batch-run annogen in multiple configurations and want to know which one is currently running")
# If the command line asked for --html-options or --markdown-options, stop here:
# the HTML variant first prints the closing </dl> tag, then both variants exit
# (presumably the option listing itself was produced while the options were
# being defined above -- that code is outside this section).
if any(flag in sys.argv for flag in ('--html-options','--markdown-options')):
    if '--html-options' in sys.argv:
        print ("</dl>")
    sys.exit()
# Inspect $TERM (empty string if unset) to decide whether ANSI escape
# sequences may be used: any value containing "xterm", or exactly
# "screen" or "linux".
term = os.environ.get("TERM","")
is_xterm = term.find("xterm") >= 0
ansi_escapes = is_xterm or term in ("screen","linux")
def isatty(f):
    "Returns False if f has no isatty attribute, otherwise whatever f.isatty() returns"
    if not hasattr(f,"isatty"):
        return False
    return f.isatty()
# Choose the strings used for status-line decoration: real escape sequences
# when ANSI is allowed and standard error is a terminal, otherwise plain-text
# stand-ins.
if ansi_escapes and isatty(sys.stderr):
    clear_eol = "\x1b[K"
    reverse_on,reverse_off = "\x1b[7m","\x1b[0m"
    bold_on,bold_off = "\x1b[1m","\x1b[0m"
else:
    clear_eol = " "
    reverse_on,reverse_off = " **","** "
    bold_on = bold_off = ""
# Show the module docstring as a banner
sys.stderr.write("".join([bold_on,__doc__,bold_off,"\n"])) # not sys.stdout: may or may not be showing --help (and anyway might want to process the help text for website etc)
# Parse the command line: 'options' holds the option values and 'args' any
# leftover positional arguments.
options, args = parser.parse_args()
# Copy every parsed option into the module's global namespace, so the rest of
# the script refers to options as plain variables (e.g. ybytes, sharp_multi)
# rather than as attributes of 'options'.
globals().update(options.__dict__)
# The low-level threading module is called "thread" in Python 2 and "_thread"
# in Python 3.  Only a failed import should trigger the fallback, so catch
# ImportError rather than everything (a bare except would also swallow
# KeyboardInterrupt and similar).
try:
    import thread
except ImportError:
    import _thread as thread # Python 3
import gc ; gc.disable() # should be OK if we don't create cycles (TODO: run gc.collect() manually after init, just in case?)
def warn(msg):
    "Writes msg to standard error as a single line prefixed with 'Warning: '"
    sys.stderr.write("".join(("Warning: ",msg,"\n")))
if "PyPy" in sys.version:
    warn("with annogen, PyPy is likely to run 60% slower than python") # (not to mention concurrent.futures being less likely to be available)
# Numeric options may have arrived as strings from the command line, so
# convert them to int.  A false ybytes is left as it is; a false ybytes_max
# falls back to ybytes; the two debug thresholds fall back to 0.
if ybytes:
    ybytes = int(ybytes)
ybytes_max = int(ybytes_max) if ybytes_max else ybytes
yarowsky_debug = int(yarowsky_debug) if yarowsky_debug else 0
normalise_debug = int(normalise_debug) if normalise_debug else 0
ybytes_step = int(ybytes_step)
ymax_threshold = int(ymax_threshold)
def errExit(msg):
    """Prints msg (plus a newline) to standard error and exits with status 1.
    Before that, if an output file other than standard output has been
    opened, it is closed and c_filename is removed via rm_f; any failure
    in that cleanup is ignored."""
    try:
        writing_to_stdout = (outfile==getBuf(sys.stdout))
        if not writing_to_stdout:
            outfile.close()
            rm_f(c_filename)
    except: pass # works only if got past outfile opening
    sys.stderr.write(msg+"\n")
    sys.exit(1)
# No positional arguments are accepted
if args:
    errExit("Unknown argument "+repr(args[0]))
# The browser extension and the existing-ruby regex list both need names
# for the sharp-multi alternatives
if sharp_multi and not annotation_names:
    if browser_extension or existing_ruby_lang_regex:
        errExit("--sharp-multi requires --annotation-names to be set if --browser-extension or --existing-ruby-lang-regex")
# Pad --existing-ruby-lang-regex with the placeholder pattern ^\b$ until it
# has one comma-separated entry per annotation name.  --annotation-names is
# only mandatory together with --sharp-multi, so it can be None here; treat
# that as a single (unnamed) annotation instead of failing on None.split.
if existing_ruby_lang_regex:
    while len(existing_ruby_lang_regex.split(','))<len((annotation_names or "").split(',')): existing_ruby_lang_regex += r",^\b$"
# Options that imply other options
if browser_extension:
    javascript = True
if android_template:
    android = "file:///android_asset/index.html"
# Options that depend on other options
if android and not java:
    errExit('You must set --java=/path/to/src//name/of/package when using --android')
if bookmarks and not android:
    errExit("--bookmarks requires --android, e.g. --android=file:///android_asset/index.html")
# A / is tolerated in gloss_simplify only inside [...] when the target is Javascript or Android
if '/' in re.sub(r"\[[^]]*\]","",gloss_simplify) and (javascript or android):
    errExit("Any / in gloss_simplify must be protected for Javascript")
if known_characters:
    if not (android or javascript):
        errExit("--known-characters requires --android, --javascript or --browser-extension")
    if freq_count:
        errExit("--known-characters and --freq-count must be on separate runs in the current implementation") # otherwise need to postpone loading known_characters
    if android and not android_template and not ("ANDROID_NO_UPLOAD" in os.environ and "GOOGLE_PLAY_TRACK" in os.environ):
        warn("known-characters without android-template means you call the Javascript functions yourself")
if android_print and not bookmarks:
    errExit("The current implementation of --android-print requires --bookmarks to be set as well")
# Validate --android-audio: it needs --android-print and must not contain
# quotes or backslashes.
if android_audio:
if not android_print: errExit("The current implementation of --android-audio requires --android-print to be set as well") # for the highlighting (and TODO: I'm not sure about the HTML5-Audio support of Android 2.x devices etc, so should we check a minimum Android version before making the audio option available? as highlight option can be done pre-4.4 just no way to save the result)
if "'" in android_audio or '"' in android_audio or '\\' in android_audio: errExit("The current implementation of --android-audio requires the URL not to contain any quotes or backslashes, please percent-encode them")
# A value containing a space is split on whitespace into exactly two parts:
# the URL and an integer stored in android_audio_maxWords
if ' ' in android_audio:
android_audio,android_audio_maxWords = android_audio.split()
android_audio_maxWords = int(android_audio_maxWords)
else: android_audio_maxWords=None
if (extra_js or extra_css or tts_js) and not android: errExit("--extra-js, --tts-js and --extra-css require --android")
if (existing_ruby_lang_regex or existing_ruby_js_fixes) and not (android or javascript): errExit("--existing-ruby-lang-regex and --existing-ruby-js-fixes require --android, --javascript or --browser-extension")
# Replace unset (None) values with empty strings so .startswith can be used on them
if not extra_css: extra_css = ""
if not extra_js: extra_js = ""
if not existing_ruby_js_fixes: existing_ruby_js_fixes = ""
if not annotation_postprocess: annotation_postprocess = ""
# A leading @ means "read the value from this file" (as raw bytes).
# 'with' makes sure each file is closed again once it has been read.
if extra_css.startswith("@"):
    with open(extra_css[1:],"rb") as f: extra_css = f.read()
if annotation_postprocess.startswith("@"):
    with open(annotation_postprocess[1:],"rb") as f: annotation_postprocess = f.read()
if annotation_postprocess and not java: errExit("--annotation-postprocess is currently implemented only for Java") # TODO could at least do JS
# Byte/text helpers used throughout: B() gives bytes, S() gives the native
# str, and getBuf() gives the byte-oriented side of a stream.
if isinstance(u"", str): # Python 3
    def B(s):
        "Encodes s as latin1 bytes; returns s unchanged if that fails for any reason (e.g. it is already bytes)"
        try:
            encoded = s.encode('latin1')
        except:
            return s
        return encoded
    def S(b):
        "Decodes b from latin1 to str; returns b unchanged if that fails for any reason (e.g. it is already str)"
        try:
            decoded = b.decode('latin1')
        except:
            return b
        return decoded
    def getBuf(f):
        "Returns f.buffer if f has one, otherwise f itself"
        try:
            buf = f.buffer
        except:
            return f
        return buf
else: # Python 2: pass through as quickly as possible
    def B(s):
        return s # (and as this particular script shouldn't need to run on a Python 2 below 2.7, we also use b"" inline for literals)
    def S(s):
        return s
    def getBuf(f):
        return f
# --extra-js=@file1,file2:search:replace:search2:replace2,...
# Each comma-separated item names a Javascript file, optionally followed by
# colon-separated search/replace pairs to apply to its contents.  Every file
# is syntax-checked with node (if available), checked for arrow functions,
# and appended to extra_js as bytes.
if extra_js.startswith("@"):
    f,extra_js=extra_js,b""
    can_check_syntax = not os.system("which node 2>/dev/null >/dev/null")
    for f in f[1:].split(','):
        if ':' in f: f,fSR = f.split(':',1)
        else: fSR=None
        with open(f,"rb") as fh: dat = fh.read() # closed as soon as it is read
        if fSR:
            fSR = fSR.split(':')
            # search and replace strings come in pairs: report a dangling search string clearly instead of failing with an IndexError below
            if len(fSR) % 2: errExit("extra-js with search and replace: "+repr(fSR[-1])+" has no replacement in "+f)
            for i in range(0,len(fSR),2):
                if not B(fSR[i]) in dat: errExit("extra-js with search and replace: unable to find "+repr(fSR[i])+" in "+f)
                dat = dat.replace(B(fSR[i]),B(fSR[i+1]))
        if can_check_syntax:
            out = err = True # True = "no check has been run yet"
            if os.path.exists("/dev/shm"):
                # node -c /dev/stdin can fail on some installations of GNU/Linux (but /dev/shm can fail on others, so try both)
                fn="/dev/shm/"+str(os.getpid())+".js"
                with open(fn,"wb") as fh: fh.write(dat) # must be closed (and therefore flushed) before node reads it
                out,err = subprocess.Popen("node -c "+fn,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE).communicate()
                os.remove(fn)
            if out or err:
                # first attempt not run, or it reported something: try via standard input
                out0,err0 = out,err
                out,err = subprocess.Popen("node -c /dev/stdin",shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE).communicate(dat)
                if (out or err) and not out0==True:
                    # both attempts produced output: report both
                    out,err = out0+out,err0+err
                if out or err: errExit("Syntax check failed for extra-js file "+f+"\n"+"node stdout: "+repr(out)+"\nnode stderr: "+repr(err))
        else: warn("No syntax checker available for "+f)
        m=re.search(br"\([^)]*\)\s*=>\s*{",dat)
        if m: errExit(f+" seems to have arrow function (breaks compatibility with Android 4.x): "+repr(m.group())) # TODO: also check for ||= (but not in comments; comments would need rm 1st); ||= requires Chrome 85
        extra_js += dat ; del dat,fSR
if extra_js.rstrip() and not B(extra_js.rstrip()[-1:]) in b';}': errExit("--extra-js must end with a semicolon or a closing brace")
# --existing-ruby-js-fixes=@file reads the code from a file, as bytes
# (closing the file afterwards).
if existing_ruby_js_fixes.startswith("@"):
    with open(existing_ruby_js_fixes[1:],"rb") as f: existing_ruby_js_fixes = f.read()
# The value is now bytes if it came from a file but still a native string if
# it was given inline.  Python 3's re refuses to mix a str pattern with bytes
# data, so the innerHTML/outerHTML check picks the pattern type to match.
if browser_extension and re.search((r"erHTML *=[^=]" if isinstance(existing_ruby_js_fixes,type("")) else br"erHTML *=[^=]"),existing_ruby_js_fixes): warn("Code in --existing-ruby-js-fixes that sets innerHTML or outerHTML might result in an extension that's not accepted by Firefox uploads")
# Split --java=SRC//pkg/path into the source directory (jSrc) and the package
# path (jRest), and derive the dotted package name (jPackage).
jPackage = None
if java:
    if '//' not in java:
        errExit("--java must include a // to separate the first part of the path from the package name")
    jSrc,jRest = java.rsplit('//',1)
    if '.' in jRest:
        errExit("--java must be ...src//org/example/package not ...src//org.example.package") # (TODO: fix it automatically in both jRest and java? only on the right-hand side of the //)
    jPackage = '.'.join(jRest.split('/'))
    if 'NewFunc' in jPackage:
        errExit("Currently unable to include the string 'NewFunc' in your package due to an implementation detail in annogen's search/replace operations")
# With no output filename and standard output on a terminal, write to
# annotator.c in the temporary directory
if not c_filename and isatty(sys.stdout):
    c_filename = os.sep.join([tempfile.gettempdir(),"annotator.c"])
def shell_escape(arg):
    """Returns arg in a form that is safe to paste into a shell command line.
    Strings made up only of letters, digits and _=/.%+,:@- (including the
    empty string) are returned as they are; anything else is wrapped in
    single quotes, with each embedded single quote written as '\\''."""
    # \Z rather than $ because $ also matches just before a trailing
    # newline, which would let "abc\n" through unquoted
    if re.match(r"[A-Za-z0-9_=/.%+,:@-]*\Z",arg): return arg
    return "'"+arg.replace("'",r"'\''")+"'"
# Output types for which --sharp-multi has not been implemented
if sharp_multi and python:
    errExit("sharp-multi not yet implemented in Python") # TODO: easy enough
elif sharp_multi and windows_clipboard:
    errExit("sharp-multi not yet implemented for windows-clipboard") # would need a way to select the annotator, probably necessitating a GUI on Windows
# Per-target setup: reject C-only options when another output language was
# requested, work out the output filename for that language, and otherwise
# adjust the default C compiler command for the Windows-clipboard and
# library variants.
if java or javascript or python or dart:
# cOnly(param) aborts with a message saying param is available only for C output
def cOnly(param): errExit(param+" not yet implemented in any language other than C, so cannot be used with --java, --javascript, --python or --dart")
if windows_clipboard: cOnly("--windows-clipboard")
if library: cOnly("--library")
if not outcode=="utf-8": cOnly("Non utf-8 outcode")
if compress: cOnly("--compress")
# Count how many of the four language options are set; only one is allowed
if sum(1 for x in [java,javascript,python,dart] if x) > 1:
errExit("Outputting more than one programming language on the same run is not yet implemented")
# Java: make sure the package directory exists, remove generated z*.java files
# from an earlier run (unless compile_only), and write to Annotator.java there
if java:
if android and not "/src//" in java: errExit("When using --android, the last thing before the // in --java must be 'src' e.g. --java=/workspace/MyProject/src//org/example/package")
if not compile_only: # (delete previous files, only if we're not a subprocess)
os.system("mkdir -p "+shell_escape(java))
for f in os.listdir(java):
if f.endswith(".java") and f.startswith("z"): os.remove(java+os.sep+f)
c_filename = java+os.sep+"Annotator.java"
if android:
os.system("rm -rf "+shell_escape(jSrc+"/../bin")) # needed to get rid of old *.class files that might be no longer used
for d in ["assets","bin","gen","res/layout","res/menu","res/values","res/xml"]: os.system("mkdir -p "+shell_escape(jSrc+"/../"+d))
# Other languages: replace a trailing ".c" in the output filename with .js, .dart or .py
elif c_filename.endswith(".c"):
if javascript: c_filename = c_filename[:-2]+".js"
elif dart: c_filename = c_filename[:-2]+".dart"
else: c_filename = c_filename[:-2]+".py"
# C for the Windows clipboard: only an unchanged default compiler command is replaced
elif windows_clipboard:
if library: errExit("Support for having both --windows-clipboard and --library at the same time is not yet implemented") # ditto
if c_compiler=="cc -o annotator": c_compiler="i386-mingw32-gcc -o annoclip.exe"
if not outcode=="utf-8": errExit("outcode must be utf-8 when using --windows-clipboard")
# C shared library: likewise, only an unchanged default compiler command is replaced
elif library:
if c_compiler=="cc -o annotator": c_compiler="gcc -shared -fPIC -Wl,-soname,annotator.so.1 -o libannotator.so.1 -lc"
# --js-6bit is meaningful only for Javascript output; it also needs urllib
if js_6bit:
    if not javascript:
        errExit("--js-6bit requires --javascript") # or just set js_6bit=False in these circumstances?
    import urllib
# Dart output: js_utf8 is turned on exactly when no separate data file is
# used, and the data file's pathname must avoid three awkward characters
if dart:
    js_utf8 = not dart_datafile
    if dart_datafile and any(bad in dart_datafile for bad in "'\\$"):
        errExit("Current implementation cannot cope with ' or \\ or $ in dart_datafile")
elif dart_datafile:
    errExit("--dart-datafile requires --dart")
# --zlib: swap the boolean option for an actual compression module,
# preferring zopfli's zlib-compatible module and falling back to the
# standard zlib if that cannot be set up.
if zlib:
    if javascript:
        errExit("--zlib not supported with Javascript")
    del zlib
    try:
        from zopfli import zlib # pip install zopfli
        # wrap compress so it accepts (and ignores) a level argument
        zlib._orig_compress = zlib.compress
        zlib.compress = lambda s,level: zlib._orig_compress(s) # delete level
        zlib_name = "zopfli"
    except:
        import zlib
        zlib_name = "zlib"
    if windows_clipboard:
        warn("--zlib with --windows-clipboard is inadvisable because ZLib is not typically present on Windows platforms. If you really want it, you'll need to figure out the compiler options and library setup for it.")
    if dart and not dart_datafile:
        warn("--zlib without --dart-datafile might not be as efficient as you'd hope (and --zlib prevents the resulting Dart code from being compiled to a \"Web app\" anyway)") # as it requires dart:io
# rulesFile needs exactly one of --read-rules / --write-rules, and vice versa
if rulesFile:
    if read_rules and write_rules:
        errExit("--read-rules and --write-rules are mutually exclusive")
    elif not (read_rules or write_rules):
        errExit("rulesFile requires --read-rules or --write-rules")
    if priority_list:
        errExit("can't set both rulesFile and priority-list") # because PairPriorities uses corpus, not rules
elif read_rules or write_rules:
    errExit("--read-rules or --write-rules requires rulesFile")
# No C compiler is involved when another output language was chosen
if any([java,javascript,python,dart]):
    c_compiler = None
# Make the Python 2 names xrange, unichr and unicode available on Python 3
try:
    xrange # Python 2
except:
    # Python 3
    xrange = range
    unichr = chr
    unicode = str
# --post-normalise names a Python file; it is loaded as a module and its
# 'table' attribute (a dictionary mapping code point to code point) replaces
# the option value.
if post_normalise:
if not (javascript or java or freq_count): errExit('--post-normalise currently requires --javascript or --java (or --freq-count)')
if type("")==type(u""): # Python 3 (this requires 3.5+, TODO: support 3.3/3.4 ?)
import importlib.util as iu
s = iu.spec_from_file_location("post.normalise", post_normalise)
post_normalise = iu.module_from_spec(s) ; s.loader.exec_module(post_normalise)
else: # Python 2
import imp
post_normalise = imp.load_source('post.normalise', post_normalise)
post_normalise = post_normalise.table
# Prune the table; iterate over a list copy of the items because entries are deleted along the way
for k,v in list(post_normalise.items()):
if not (k<=0xFFFF and v<=0xFFFF and len(unichr(k).encode('utf-8'))==len(unichr(v).encode('utf-8'))): del post_normalise[k] # BMP only for now, and only mappings that don't change UTF-8 length so inBytes / origInBytes are sync'd
elif k==v: del post_normalise[k] # don't need identity mappings
# No code point may appear both as a key and as a value
problems = set(post_normalise.keys()).intersection(set(post_normalise.values()))
if problems: errExit("--post-normalise table problem: both keys AND values have "+", ".join(hex(h) for h in sorted(list(problems))))
# post_normalise_translate(u) applies the table to a Unicode string
if type(u"")==type(""): post_normalise_translate = lambda x:x.translate(post_normalise) # Python 3 can use the dictionary as-is
else: post_normalise_translate = lambda u: u''.join(unichr(post_normalise.get(ord(i),ord(i))) for i in u) # as Python 2 .translate can take only len=256 (at least as documented; some versions can do more but not all tested), so we'd better write it out ourselves
# Character set for text given on the command line: the locale's preferred
# encoding, or utf-8 if that cannot be determined or comes back empty
try:
    import locale
    terminal_charset = locale.getpreferredencoding()
except:
    terminal_charset = None
terminal_charset = terminal_charset or "utf-8"
# --existing-ruby-shortcut-yarowsky needs android, ybytes and glossfile all set
if existing_ruby_shortcut_yarowsky and not (android and ybytes and glossfile):
    errExit("--existing-ruby-shortcut-yarowsky makes sense only when generating an Android app with both ybytes and glossfile set")
def T(s):
    "Returns s as a Unicode string: unchanged if it already is one, otherwise decoded using terminal_charset"
    if type(s) is not type(u""):
        return s.decode(terminal_charset)
    return s # Python 3
# Comma-separated word lists become sets of Unicode strings
if keep_whitespace:
    keep_whitespace = set(T(keep_whitespace).split(','))
if ymax_limitwords:
    ymax_limitwords = set(T(ymax_limitwords).split(','))
if multiword_end_avoid:
    multiword_end_avoid = set(T(multiword_end_avoid).split(','))
# The status prefix is either "label: " or empty
status_prefix = status_prefix+": " if status_prefix else ""
if diagnose:
    diagnose = T(diagnose)
# Remaining numeric options (--single-words overrides --max-words)
diagnose_limit = int(diagnose_limit)
max_words = int(max_words)
if single_words:
    max_words = 1
if read_rules and diagnose_manual:
    errExit("--diagnose-manual is not compatible with --read-rules")
suffix_minlen = int(suffix_minlen)
# Sets up decompress_func (C source, as bytes) and, for --compress, the
# squash()/squashFinish() pair that implements a simple scheme in which
# each otherwise-unused byte value stands for one 2-byte sequence.
if compress:
    squashStrings = set() ; squashReplacements = []
    def squashFinish():
        """Chooses the 2-byte substitutions, using the strings that squash()
        collected during its dry run.  Appends each (pair, token) to
        squashReplacements, sets squashStrings to "done" so that later
        squash() calls perform the substitutions, and returns the table of
        pairs as C-escaped bytes (via c_escapeRawBytes)."""
        global squashStrings # so can set it to "done" at end
        tokens = set()
        for s in squashStrings: tokens.update(list(S(s)))
        totSaved = 0
        # byte values 1-255 that occur in none of the strings can be used as tokens
        tokens = [chr(t) for t in range(1,256) if not chr(t) in tokens] ; orig_tokens = set(tokens)
        pairs = [chr(0)] * 512 # pairs[t] = 1st byte for token t, pairs[t+256] = 2nd byte
        while tokens and squashStrings:
            t = tokens.pop()
            counts = {}
            for s in squashStrings:
                # To make decompression as fast and compact as possible, each 1-byte token represents 2 bytes exactly. In practice allowing it to represent variable lengths of whole bytes up to 4 is not likely to improve the compression by more than 3.2% (that's 3.2% of the 10-20% it achieves, so it's around 0.5%), and not very much better for length 9, so we might as well stick with this simpler scheme unless we do real LZMA or whatever.
                for i in range(0,len(s)-1):
                    k = s[i:i+2]
                    if S(k[:1]) in orig_tokens or S(k[1:]) in orig_tokens: continue # to keep the decoder simple, don't set things up so it needs to recurse (being able to recurse within the 2-byte expansion is very unlikely to save anything in practice anyway - it didn't on my annotators - so not worth implementing the decoder for)
                    counts[k] = counts.get(k,0) + 1
            if not counts:
                # no 2-byte sequence is left to substitute (max() below would raise ValueError): return the unused token and stop
                tokens.append(t) ; break
            bSaved, k = max((v,k) for k,v in counts.items())
            pairs[ord(t)] = k[:1]
            pairs[ord(t)+256] = k[1:]
            squashReplacements.append((k,B(t))) # this assumes we won't be doing things like 'if ALL instances of a byte end up in our tokens, add the byte's original value as an extra token'
            for s in list(squashStrings): # iterate over a snapshot, because the set is modified inside the loop
                s2 = s.replace(k,B(t))
                if not s2==s:
                    squashStrings.remove(s) ; squashStrings.add(s2)
            totSaved += bSaved
            sys.stderr.write("Compress: %d/%d tokens, %d bytes saved%s\r" % (len(orig_tokens)-len(tokens),len(orig_tokens),totSaved,clear_eol)) ; sys.stderr.flush()
        squashStrings = "done"
        # trailing unused entries of the second half of the table can be dropped
        while len(pairs) > 256 and pairs[-1]==chr(0): pairs = pairs[:-1]
        sys.stderr.write("\n")
        if totSaved < len(pairs)+50: sys.stderr.write("Warning: --compress on this data made it bigger! Consider dropping --compress\n") # 50 as rough guess for OutWriteDecompress binary (probably about 12 instructions at 4+ bytes each)
        return c_escapeRawBytes(b"".join(B(p) for p in pairs))
    decompress_func=br"""
static unsigned char pairs[]="%%PAIRS%%";
static void OutWriteDecompress(const char *s) {
while(*s) {
int i=(unsigned char)*s;
if (pairs[i]) { OutWriteByte(pairs[i]); OutWriteByte(pairs[i|0x100]); } else OutWriteByte(*s);
s++;
}
}"""
    if sharp_multi: decompress_func += br"""
static int ns; static void OutWriteNSB(int b) {
if(b=='#') ns++; else if(ns==numSharps) OutWriteByte(b);
}
static void OutWriteDecompressP(const char *s) {
ns=0; while(*s && ns<=numSharps) {
int i=(unsigned char)*s;
if (pairs[i]) { OutWriteNSB(pairs[i]); OutWriteNSB(pairs[i|0x100]); } else OutWriteNSB(*s);
s++;
}
}"""
    def squash(byteStr):
        """Before squashFinish() has run, records byteStr for analysis and
        returns it unchanged; afterwards, returns byteStr with every
        recorded replacement applied in order."""
        if squashStrings == "done":
            for k,v in squashReplacements:
                byteStr = byteStr.replace(k,v)
        else: squashStrings.add(byteStr) # for the dry run
        return byteStr
elif sharp_multi: decompress_func = br"""
static void OutWriteStrP(const char *annot) {
int ns = numSharps;
while(ns--) {
annot = strchr(annot,'#');
if (!annot) return; else annot++;
}
char* m = strchr(annot,'#');
if(m) OutWriteStrN(annot,m-annot); else OutWriteStr(annot);
}
"""
else: decompress_func = b""
def annotMap(varName="annotNo",mayNeedParen=False):
    """Returns (as bytes) an expression giving the annotation number to use.
    With no annotation_map that is just varName; otherwise it is a chain of
    conditionals, one per K=V override (K and V being 1-based on the command
    line, 0-based in the expression), ending in varName.  If mayNeedParen
    is set, an expression containing == is wrapped in parentheses."""
    parts = []
    if annotation_map:
        for override in annotation_map.split(","):
            k,v = override.split('=')
            parts.append(varName+"=="+str(int(k)-1)+"?"+str(int(v)-1)+":")
    parts.append(varName)
    r = "".join(parts)
    if mayNeedParen and "==" in r:
        r = "("+r+")"
    return B(r)
# cfn is c_filename without any leading directories
cfn = c_filename
if c_filename and os.sep in c_filename:
    cfn = c_filename.rsplit(os.sep,1)[1]
# Library build: the generated C file opens with a comment block showing how
# to wrap the shared library from Python via ctypes.  The text is assembled
# piecewise because the documented signatures depend on sharp_multi (an extra
# int argument) and on outcode (annotateRawLatinize is only described for
# UTF-8 output).
if library:
c_preamble = br"""
/*
This library is NOT thread safe. But you can use it
with single-threaded or multiprocess code like Web Adjuster
(not in WSGI mode).
To wrap this library in Python (2 or 3), you can do:
from ctypes import CDLL,c_char_p,c_int
alib = CDLL("./libannotator.so.1")
_annotate,_afree = alib.annotate,alib.afree
_annotate.restype = c_char_p
_annotate.argtypes = [c_char_p"""
# argtypes for annotate: an extra c_int when sharp_multi, then the mode c_int
if sharp_multi: c_preamble += b",c_int"
c_preamble += b",c_int]"
# UTF-8 only: document annotateRawLatinize through an annotR() wrapper
if outcode=="utf-8":
c_preamble += br"""
_annotateRL = alib.annotateRawLatinize
_annotateRL.restype = c_char_p
_annotateRL.argtypes = [c_char_p"""
if sharp_multi: c_preamble += b",c_int"
c_preamble += b"]\ndef annotR(txt"
if sharp_multi: c_preamble += b",aType=0"
c_preamble += br"""):
if type(txt)==type(u''): txt = txt.encode('utf-8')
r = _annotateRL(txt"""
if sharp_multi: c_preamble += b",aType"
c_preamble += br""")
_afree() ; return r"""
# annotate() wrapper; the sample encodes unicode input using outcode
c_preamble += b"\ndef annotate(txt"
if sharp_multi: c_preamble += b",aType=0"
c_preamble += br""",aMode=1):
"aMode: 0 = raw, 1 = ruby (default), 2 = braces"
if type(txt)==type(u''): txt = txt.encode('"""+B(outcode)+br"""')
r = _annotate(txt"""
if sharp_multi: c_preamble += b",aType"
c_preamble += br""",aMode)
_afree() ; return r
# then for Web Adjuster you can do, for example,
# adjuster.annotFunc1 = lambda t:annotate(t"""
# finish the sample lambda and add the matching htmlFilter example lines
if sharp_multi: c_preamble += b",1"
c_preamble += b",1)\n"
if outcode=="utf-8":
if sharp_multi: c_preamble += b"# adjuster.annotFunc1R = lambda t:annotR(t,1)"
else: c_preamble += b"# adjuster.annotFunc1R = annotR"
c_preamble += br"""
# adjuster.options.htmlFilter = "*annotFunc1#*annotFunc1R"
# adjuster.options.htmlFilterName = "ruby#annot-only"
"""
else: c_preamble += br"""
# adjuster.options.htmlFilter = "*annotFunc1"
"""
# non-UTF-8 output gets an extra warning in the generated comment
if not outcode=="utf-8": c_preamble += br"""
# but BEWARE Web Adjuster assumes UTF-8; you'd better write a wrapper to re-code it
""" # (TODO: automate this?)
c_preamble += br"""
Compile with:
gcc -shared -fPIC -Wl,-soname,annotator.so.1 -o libannotator.so.1 annotator.c -lc
*/
"""
# show the actual output file name in the compile command when one is known
if cfn: c_preamble=c_preamble.replace(b"annotator.c",B(cfn))
# standard headers included by the generated C file
c_preamble += br"""
#include <stdlib.h>
#include <string.h>
"""
# Core C definitions for the library build: input cursors (readPtr, writePtr,
# startPtr) with their byte-access macros, plus a heap output buffer
# (outBytes) that OutWriteStrN/OutWriteByte grow by doubling via realloc; a
# failed realloc makes the write return without storing anything.
# near(string) reports whether string occurs in the window that starts
# nearbytes bytes before readPtr (clamped to startPtr) and ends nearbytes
# bytes after it.
c_defs = br"""static const unsigned char *readPtr, *writePtr, *startPtr;
static char *outBytes;
static size_t outWriteLen,outWritePtr;
#define NEXTBYTE (*readPtr++)
#define NEXT_COPY_BYTE (*writePtr++)
#define COPY_BYTE_SKIP writePtr++
#define COPY_BYTE_SKIPN(n) writePtr += (n)
#define POSTYPE const unsigned char*
#define THEPOS readPtr
#define SETPOS(p) (readPtr=(p))
#define PREVBYTE readPtr--
#define FINISHED (!(*readPtr))
static void OutWriteStrN(const char *s,size_t l) {
size_t newLen = outWriteLen;
while (outWritePtr+l > newLen) newLen *= 2;
if (newLen > outWriteLen) {
char *ob2 = realloc(outBytes,newLen);
if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
outBytes = ob2; outWriteLen = newLen;
}
memcpy(outBytes+outWritePtr, s, l);
outWritePtr += l;
}
static void OutWriteStr(const char *s) {
OutWriteStrN(s,strlen(s));
}
static void OutWriteByte(char c) {
if (outWritePtr >= outWriteLen) {
size_t newLen = outWriteLen * 2;
char *ob2 = realloc(outBytes,newLen);
if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */
outBytes = ob2; outWriteLen = newLen;
}
outBytes[outWritePtr++] = c;
}
int near(char* string) {
const unsigned char *startFrom = readPtr-nearbytes,
*end = readPtr+nearbytes;
if (startFrom < startPtr) startFrom = startPtr;
size_t l=strlen(string); end -= l;
while (*startFrom && startFrom <= end) {
if(!strncmp(startFrom,string,l)) return 1;
startFrom++;
}
return 0;
}
void matchAll();"""
# Public entry points: afree() releases the output buffer; annotate() points
# the cursors at input, allocates strlen(input)*5+1 bytes as a first guess,
# runs matchAll() and returns the buffer with a 0 byte appended (NULL if the
# malloc failed).
c_defs += br"""
void afree() { if(outBytes) free(outBytes); outBytes=NULL; }
char *annotate(const char *input"""
# sharp_multi builds take an extra annotNo parameter ...
if sharp_multi: c_defs += b", int annotNo"
c_defs += br""",int aMode) {
readPtr=writePtr=startPtr=(char*)input;
outWriteLen = strlen(startPtr)*5+1; /* initial guess (must include the +1 to ensure it's non-0 for OutWrite...'s *= code) */
afree(); outBytes = malloc(outWriteLen);"""
# ... and numSharps is assigned the expression that annotMap() builds
if sharp_multi: c_defs += b" numSharps="+annotMap()+b";"
c_defs += br""" annotation_mode = aMode;
if(outBytes) { outWritePtr = 0; matchAll(); }
if(outBytes) OutWriteByte(0);
return outBytes;
}
"""
# UTF-8 output only: also emit the "bonus" annotateRawLatinize() entry point.
# The generated C runs annotate() in annotations_only mode, copies the result,
# then re-scans that copy with latinizeMatch(), which
#   - upper-cases the first letter after a sentence end (latCap), covering
#     a-z plus the accented letters handled under lead bytes 0xC3/0xC4/0xC5/0xC7;
#   - emits a pending space (latSpace) before the next letter;
#   - rewrites the ideographic comma / full stop (E3 80 81, E3 80 82) and
#     full-width ASCII forms (EF BC xx, EF BD xx) as plain ASCII.
# Fix: the "no pending space after a digit" test for full-width digits used to
# compare the already-converted ASCII value (always 0x21..0x5F) with
# 0x90..0x99, so it could never fire.  It now tests the UTF-8 trail byte,
# which is 0x90..0x99 for U+FF10..U+FF19, matching the ASCII-digit rule.
if outcode=="utf-8": # (TODO: document this feature? non-utf8 versions ??)
    c_defs += br"""
static void latinizeMatch(); static int latCap,latSpace;
char *annotateRawLatinize(const char *input"""
    if sharp_multi: c_defs += b", int annotNo"
    c_defs += br""") {
// "Bonus" library function, works only if annotation is Latin-like,
// tries to improve the capitalisation when in 'raw' mode
// (TODO: make this available in other annogen output formats? work into ruby mode??)
char *tmp=annotate(input"""
    if sharp_multi: c_defs += b",annotNo"
    c_defs += br""",annotations_only);
if(tmp) { tmp=strdup(tmp); if(tmp) {
readPtr=writePtr=startPtr=tmp;
afree(); outBytes=malloc(outWriteLen);
if(outBytes) {
outWritePtr = 0; latCap=1; latSpace=0;
while(!FINISHED) {
POSTYPE oldPos=THEPOS;
latinizeMatch();
if (oldPos==THEPOS) { OutWriteByte(NEXTBYTE); COPY_BYTE_SKIP; }
}
}
if(outBytes) OutWriteByte(0);
free(tmp);
} } return(outBytes);
}
static inline void doLatSpace() {
if(latSpace) {
OutWriteByte(' ');
latSpace = 0;
}
}
static void latinizeMatch() {
POSTYPE oldPos=THEPOS;
int nb = NEXTBYTE;
if (latCap || latSpace) {
if (nb >= '0' && nb <= '9') latSpace = 0; /* 1:1 */
else if(nb >= 'A' && nb <= 'Z') {
latCap = 0; doLatSpace();
} else if(nb >= 'a' && nb <= 'z') {
doLatSpace();
if(latCap) {
latCap = 0;
OutWriteByte(nb-('a'-'A')); return;
}
} else switch(nb) {
case 0xC3:
{ int nb2 = NEXTBYTE;
switch(nb2) {
case 0x80: case 0x81: case 0x88: case 0x89:
case 0x8c: case 0x8d: case 0x92: case 0x93:
case 0x99: case 0x9a:
doLatSpace();
latCap=0; break;
case 0xa0: case 0xa1: case 0xa8: case 0xa9:
case 0xac: case 0xad: case 0xb2: case 0xb3:
case 0xb9: case 0xba:
doLatSpace();
if (latCap) {
OutWriteByte(0xC3);
OutWriteByte(nb2-0x20); latCap=0; return;
}
} break; }
case 0xC4:
{ int nb2 = NEXTBYTE;
switch(nb2) {
case 0x80: case 0x92: case 0x9a: case 0xaa:
doLatSpace();
latCap=0; break;
case 0x81: case 0x93: case 0x9b: case 0xab:
doLatSpace();
if (latCap) {
OutWriteByte(0xC4);
OutWriteByte(nb2-1); latCap=0; return;
}
} break; }
case 0xC5:
{ int nb2 = NEXTBYTE;
switch(nb2) {
case 0x8c: case 0xaa:
doLatSpace();
latCap=0; break;
case 0x8d: case 0xab:
doLatSpace();
if (latCap) {
OutWriteByte(0xC5);
OutWriteByte(nb2-1); latCap=0; return;
}
} break; }
case 0xC7:
{ int nb2 = NEXTBYTE;
switch(nb2) {
case 0x8d: case 0x8f: case 0x91: case 0x93:
case 0x95: case 0x97: case 0x99: case 0x9b:
doLatSpace();
latCap=0; break;
case 0x8e: case 0x90: case 0x92: case 0x94:
case 0x96: case 0x98: case 0x9a: case 0x9c:
doLatSpace();
if (latCap) {
OutWriteByte(0xC7);
OutWriteByte(nb2-1); latCap=0; return;
}
} break; }
}
}
switch(nb) {
case 0xE2: /* could be opening quote */
if(NEXTBYTE==0x80) switch(NEXTBYTE) {
case 0x98: case 0x9c:
OutWriteByte(' '); latSpace = 0;
}
break;
case 0xE3: /* could be Chinese stop or list-comma */
if(NEXTBYTE==0x80) switch(NEXTBYTE) {
case 0x81:
OutWriteByte(','); latSpace = 1; return;
case 0x82:
OutWriteByte('.'); latSpace = 1;
latCap=1; return;
} break;
case 0xEF: /* could be full-width ascii */
switch(NEXTBYTE) {
case 0xBC:
{
int b=NEXTBYTE;
if (b >= 0x81 && b <= 0xbf) {
int punc = b-(0x81-'!');
switch(punc) {
case '(': OutWriteByte(' '); latSpace = 0;
}
OutWriteByte(punc);
if (b >= 0x90 && b <= 0x99) latSpace = 0; /* full-width digit */
else switch(punc) {
case '!': case '.': case '?':
latCap = 1; /* fall through */
case ')': case ',':
case ':': case ';':
latSpace = 1;
}
return;
}
break;
}
case 0xBD:
{
int b=NEXTBYTE;
if (b >= 0x80 && b <= 0x9d) {
/* TODO: capitalise if it's a letter (but probably not needed in most annotations) */
OutWriteByte(b-(0x80-'`')); return;
}
} break;
} break;
}
SETPOS(oldPos);
}
"""
# The flag simply mirrors `library`; how it is consumed is not visible in
# this excerpt (presumably it gates the raw/ruby/braces mode switch -- confirm).
have_annotModes = library # only ruby is needed by the Android code
elif windows_clipboard:
c_preamble = br"""/*
For running on Windows desktop or WINE, compile with:
i386-mingw32-gcc annoclip.c -o annoclip.exe
For running on Windows Mobile 2003SE, 5, 6, 6.1 or 6.5,
compile with:
arm-cegcc-gcc annoclip.c -D_WINCE -Os -o annoclip-WM.exe
or (if you have MSVC 2008 on a Windows machine),
set PATH=%VCINSTALLDIR%\ce\bin\x86_arm;%PATH%
set lib=%VCINSTALLDIR%\ce\lib\armv4
set include=%VSINSTALLDIR%\SmartDevices\SDK\Smartphone2003\Include;%VCINSTALLDIR%\ce\include;%VCINSTALLDIR%\include
set CL=/TP /EHsc /D "_WIN32_WCE=0x420" /D UNDER_CE /D WIN32_PLATFORM_PSPC /D _WINCE /D _WINDOWS /D ARM /D _ARM_ /D _UNICODE /D UNICODE /D POCKETPC2003_UI_MODEL
set LINK=/force:multiple /NODEFAULTLIB:oldnames.lib /SUBSYSTEM:WINDOWSCE /LIBPATH:"C:\Program Files\Windows Mobile 5.0 SDK R2\PocketPC\Lib\ARMV4I" /OUT:annoclip-WM.exe /MANIFEST:NO /STACK:65536,4096 /DYNAMICBASE:NO aygshell.lib coredll.lib corelibc.lib ole32.lib oleaut32.lib uuid.lib commctrl.lib
cl /D_WIN32_IE=0x0400 /D_WIN32_WCE=0x0400 /Os /Og annoclip.c
(you could try omitting /Os /Og for faster compilation,
but RAM is likely important on the Windows Mobile device)
*/