-
Notifications
You must be signed in to change notification settings - Fork 7
/
bidi.c
1685 lines (1549 loc) · 54 KB
/
bidi.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Implementation of the Unicode bidirectional and Arabic shaping
* algorithms for PuTTY.
*
* Original version written and kindly contributed to this code base
* by Ahmad Khalifa of Arabeyes. The bidi part was almost completely
* rewritten in 2021 by Simon Tatham to bring it up to date, but the
* shaping part is still the one by the original authors.
*
* Implementation notes:
*
* Algorithm version
* -----------------
*
* This algorithm is up to date with Unicode Standard Annex #9
* revision 46:
*
* https://www.unicode.org/reports/tr9/tr9-46.html
*
* and passes the full conformance test suite in Unicode 15.0.0.
*
* Paragraph and line handling
* ---------------------------
*
* The full Unicode bidi algorithm expects to receive text containing
* multiple paragraphs, together with a decision about how those
* paragraphs are broken up into lines. It calculates embedding levels
* a whole paragraph at a time without considering the line breaks,
* but then the final reordering of the text for display is done to
* each _line_ independently based on the levels computed for the text
* in that line.
*
* This algorithm omits all of that, because it's intended for use as
* a display-time transformation of a text terminal, which doesn't
* preserve enough semantic information to decide what's a paragraph
* break and what is not. So a piece of input text provided to this
* algorithm is always expected to consist of exactly one paragraph
* *and* exactly one line.
*
* Embeddings, overrides and isolates
* ----------------------------------
*
* This implementation has full support for all the Unicode special
* control characters that modify bidi behaviour, such as
*
* U+202A LEFT-TO-RIGHT EMBEDDING
* U+202B RIGHT-TO-LEFT EMBEDDING
* U+202D LEFT-TO-RIGHT OVERRIDE
* U+202E RIGHT-TO-LEFT OVERRIDE
* U+202C POP DIRECTIONAL FORMATTING
* U+2068 FIRST STRONG ISOLATE
* U+2066 LEFT-TO-RIGHT ISOLATE
* U+2067 RIGHT-TO-LEFT ISOLATE
* U+2069 POP DIRECTIONAL ISOLATE
*
* However, at present, the terminal emulator that is a client of this
* code has no way to pass those in (because they're dropped during
* escape sequence processing and don't get stored in the terminal
* state). Nonetheless, the code is all here, so if the terminal
* emulator becomes able to record those characters at some later
* point, we'll be all set to take account of them during bidi.
*
* But the _main_ purpose of supporting the full bidi algorithm is
* simply that that's the easiest way to be sure it's correct, because
* if you support the whole thing, you can run the full conformance
* test suite. (And I don't 100% believe that restricting to the
* subset of _tests_ valid with a reduced character set will test the
* full set of _functionality_ relevant to the reduced set.)
*
* Retained formatting characters
* ------------------------------
*
* The standard bidi algorithm, in step X9, deletes assorted
* formatting characters from the text: all the embedding and override
* section initiator characters, the Pop Directional Formatting
* character that closes one of those sections again, and any
* character labelled as Boundary Neutral. So the characters it
* returns are not a _full_ reordering of the input; some input
* characters vanish completely.
*
* This would be fine, if it were not for the fact that - as far as I
* can see - _exactly one_ Unicode code point in the discarded
* category has a wcwidth() of more than 0, namely U+00AD SOFT HYPHEN
* which is a printing character for terminal purposes but has a bidi
* class of BN.
*
* Therefore, we must implement a modified version of the algorithm,
* as described in section 5.2 of TR9, which retains those formatting
* characters so that a client can find out where they ended up in the
* reordering.
*
* Section 5.2 describes a set of modifications to the algorithm that
* are _intended_ to achieve this without changing the rest of the
* behaviour: that is, if you take the output of the modified
* algorithm and delete all the characters that the standard algorithm
* would have removed, you should end up with the remaining characters
* in the same order that the standard algorithm would have delivered.
* However, section 5.2 admits the possibility of error, and says "in
* case of any deviation the explicit algorithm is the normative
* statement for conformance". And indeed, in one or two places I
* found I had to make my own tweaks to the section 5.2 description in
* order to get the whole test suite to pass, because I think the 5.2
* modifications if taken literally don't quite achieve that. My
* justification is that sentence of 5.2: in case of doubt, the right
* thing is to make the code behave the same as the official
* algorithm.
*
* It's possible that there might still be some undiscovered
* discrepancies between the behaviour of the standard and modified
* algorithms. So, just in case, I've kept in this code the ability to
* implement the _standard_ algorithm too! If you compile with
* -DREMOVE_FORMATTING_CHARS, this code should go back to implementing
* the literal UAX#9 bidi algorithm - so you can run your suspect
* input through both versions, making it much easier to figure out
* why they differ, and in which of the many stages of the algorithm
* the difference was introduced.
*
* However, beware that when compiling in this mode, the do_bidi
* interface to the terminal will stop working, and just abort() when
* called! The only useful thing you can do with this mode is to run
* the companion program bidi_test.c.
*/
#include <stdlib.h> /* definition of wchar_t */
#include "putty.h"
#include "misc.h"
#include "bidi.h"
typedef struct {
char type;
wchar_t form_b;
} shape_node;
/* Kept near the actual table, for verification. */
#define SHAPE_FIRST 0x621
#define SHAPE_LAST (SHAPE_FIRST + lenof(shapetypes) - 1)
static const shape_node shapetypes[] = {
/* index, Typ, Iso, Ligature Index*/
/* 621 */ {SU, 0xFE80},
/* 622 */ {SR, 0xFE81},
/* 623 */ {SR, 0xFE83},
/* 624 */ {SR, 0xFE85},
/* 625 */ {SR, 0xFE87},
/* 626 */ {SD, 0xFE89},
/* 627 */ {SR, 0xFE8D},
/* 628 */ {SD, 0xFE8F},
/* 629 */ {SR, 0xFE93},
/* 62A */ {SD, 0xFE95},
/* 62B */ {SD, 0xFE99},
/* 62C */ {SD, 0xFE9D},
/* 62D */ {SD, 0xFEA1},
/* 62E */ {SD, 0xFEA5},
/* 62F */ {SR, 0xFEA9},
/* 630 */ {SR, 0xFEAB},
/* 631 */ {SR, 0xFEAD},
/* 632 */ {SR, 0xFEAF},
/* 633 */ {SD, 0xFEB1},
/* 634 */ {SD, 0xFEB5},
/* 635 */ {SD, 0xFEB9},
/* 636 */ {SD, 0xFEBD},
/* 637 */ {SD, 0xFEC1},
/* 638 */ {SD, 0xFEC5},
/* 639 */ {SD, 0xFEC9},
/* 63A */ {SD, 0xFECD},
/* 63B */ {SU, 0x0},
/* 63C */ {SU, 0x0},
/* 63D */ {SU, 0x0},
/* 63E */ {SU, 0x0},
/* 63F */ {SU, 0x0},
/* 640 */ {SC, 0x0},
/* 641 */ {SD, 0xFED1},
/* 642 */ {SD, 0xFED5},
/* 643 */ {SD, 0xFED9},
/* 644 */ {SD, 0xFEDD},
/* 645 */ {SD, 0xFEE1},
/* 646 */ {SD, 0xFEE5},
/* 647 */ {SD, 0xFEE9},
/* 648 */ {SR, 0xFEED},
/* 649 */ {SR, 0xFEEF}, /* SD */
/* 64A */ {SD, 0xFEF1},
/* 64B */ {SU, 0x0},
/* 64C */ {SU, 0x0},
/* 64D */ {SU, 0x0},
/* 64E */ {SU, 0x0},
/* 64F */ {SU, 0x0},
/* 650 */ {SU, 0x0},
/* 651 */ {SU, 0x0},
/* 652 */ {SU, 0x0},
/* 653 */ {SU, 0x0},
/* 654 */ {SU, 0x0},
/* 655 */ {SU, 0x0},
/* 656 */ {SU, 0x0},
/* 657 */ {SU, 0x0},
/* 658 */ {SU, 0x0},
/* 659 */ {SU, 0x0},
/* 65A */ {SU, 0x0},
/* 65B */ {SU, 0x0},
/* 65C */ {SU, 0x0},
/* 65D */ {SU, 0x0},
/* 65E */ {SU, 0x0},
/* 65F */ {SU, 0x0},
/* 660 */ {SU, 0x0},
/* 661 */ {SU, 0x0},
/* 662 */ {SU, 0x0},
/* 663 */ {SU, 0x0},
/* 664 */ {SU, 0x0},
/* 665 */ {SU, 0x0},
/* 666 */ {SU, 0x0},
/* 667 */ {SU, 0x0},
/* 668 */ {SU, 0x0},
/* 669 */ {SU, 0x0},
/* 66A */ {SU, 0x0},
/* 66B */ {SU, 0x0},
/* 66C */ {SU, 0x0},
/* 66D */ {SU, 0x0},
/* 66E */ {SU, 0x0},
/* 66F */ {SU, 0x0},
/* 670 */ {SU, 0x0},
/* 671 */ {SR, 0xFB50},
/* 672 */ {SU, 0x0},
/* 673 */ {SU, 0x0},
/* 674 */ {SU, 0x0},
/* 675 */ {SU, 0x0},
/* 676 */ {SU, 0x0},
/* 677 */ {SU, 0x0},
/* 678 */ {SU, 0x0},
/* 679 */ {SD, 0xFB66},
/* 67A */ {SD, 0xFB5E},
/* 67B */ {SD, 0xFB52},
/* 67C */ {SU, 0x0},
/* 67D */ {SU, 0x0},
/* 67E */ {SD, 0xFB56},
/* 67F */ {SD, 0xFB62},
/* 680 */ {SD, 0xFB5A},
/* 681 */ {SU, 0x0},
/* 682 */ {SU, 0x0},
/* 683 */ {SD, 0xFB76},
/* 684 */ {SD, 0xFB72},
/* 685 */ {SU, 0x0},
/* 686 */ {SD, 0xFB7A},
/* 687 */ {SD, 0xFB7E},
/* 688 */ {SR, 0xFB88},
/* 689 */ {SU, 0x0},
/* 68A */ {SU, 0x0},
/* 68B */ {SU, 0x0},
/* 68C */ {SR, 0xFB84},
/* 68D */ {SR, 0xFB82},
/* 68E */ {SR, 0xFB86},
/* 68F */ {SU, 0x0},
/* 690 */ {SU, 0x0},
/* 691 */ {SR, 0xFB8C},
/* 692 */ {SU, 0x0},
/* 693 */ {SU, 0x0},
/* 694 */ {SU, 0x0},
/* 695 */ {SU, 0x0},
/* 696 */ {SU, 0x0},
/* 697 */ {SU, 0x0},
/* 698 */ {SR, 0xFB8A},
/* 699 */ {SU, 0x0},
/* 69A */ {SU, 0x0},
/* 69B */ {SU, 0x0},
/* 69C */ {SU, 0x0},
/* 69D */ {SU, 0x0},
/* 69E */ {SU, 0x0},
/* 69F */ {SU, 0x0},
/* 6A0 */ {SU, 0x0},
/* 6A1 */ {SU, 0x0},
/* 6A2 */ {SU, 0x0},
/* 6A3 */ {SU, 0x0},
/* 6A4 */ {SD, 0xFB6A},
/* 6A5 */ {SU, 0x0},
/* 6A6 */ {SD, 0xFB6E},
/* 6A7 */ {SU, 0x0},
/* 6A8 */ {SU, 0x0},
/* 6A9 */ {SD, 0xFB8E},
/* 6AA */ {SU, 0x0},
/* 6AB */ {SU, 0x0},
/* 6AC */ {SU, 0x0},
/* 6AD */ {SD, 0xFBD3},
/* 6AE */ {SU, 0x0},
/* 6AF */ {SD, 0xFB92},
/* 6B0 */ {SU, 0x0},
/* 6B1 */ {SD, 0xFB9A},
/* 6B2 */ {SU, 0x0},
/* 6B3 */ {SD, 0xFB96},
/* 6B4 */ {SU, 0x0},
/* 6B5 */ {SU, 0x0},
/* 6B6 */ {SU, 0x0},
/* 6B7 */ {SU, 0x0},
/* 6B8 */ {SU, 0x0},
/* 6B9 */ {SU, 0x0},
/* 6BA */ {SR, 0xFB9E},
/* 6BB */ {SD, 0xFBA0},
/* 6BC */ {SU, 0x0},
/* 6BD */ {SU, 0x0},
/* 6BE */ {SD, 0xFBAA},
/* 6BF */ {SU, 0x0},
/* 6C0 */ {SR, 0xFBA4},
/* 6C1 */ {SD, 0xFBA6},
/* 6C2 */ {SU, 0x0},
/* 6C3 */ {SU, 0x0},
/* 6C4 */ {SU, 0x0},
/* 6C5 */ {SR, 0xFBE0},
/* 6C6 */ {SR, 0xFBD9},
/* 6C7 */ {SR, 0xFBD7},
/* 6C8 */ {SR, 0xFBDB},
/* 6C9 */ {SR, 0xFBE2},
/* 6CA */ {SU, 0x0},
/* 6CB */ {SR, 0xFBDE},
/* 6CC */ {SD, 0xFBFC},
/* 6CD */ {SU, 0x0},
/* 6CE */ {SU, 0x0},
/* 6CF */ {SU, 0x0},
/* 6D0 */ {SU, 0x0},
/* 6D1 */ {SU, 0x0},
/* 6D2 */ {SR, 0xFBAE},
};
/*
* Returns the bidi character type of ch.
*/
unsigned char bidi_getType(int ch)
{
static const struct {
int first, last, type;
} lookup[] = {
#include "unicode/bidi_type.h"
};
int i, j, k;
i = -1;
j = lenof(lookup);
while (j - i > 1) {
k = (i + j) / 2;
if (ch < lookup[k].first)
j = k;
else if (ch > lookup[k].last)
i = k;
else
return lookup[k].type;
}
/*
* If we reach here, the character was not in any of the
* intervals listed in the lookup table. This means we return
* ON (`Other Neutrals'). This is the appropriate code for any
* character genuinely not listed in the Unicode table, and
* also the table above has deliberately left out any
* characters _explicitly_ listed as ON (to save space!).
*/
return ON;
}
/*
* Return the mirrored version of a glyph.
*
* FIXME: there are also glyphs which the text rendering engine is
* supposed to display left-right reflected, since no mirrored glyph
* exists in Unicode itself to indicate the reflected form. Those are
* listed in comments in BidiMirroring.txt. Many of them are
* mathematical, e.g. the square root sign, or set difference
* operator, or integral sign. No API currently exists here to
* communicate the need for that reflected display back to the client.
*/
static unsigned mirror_glyph(unsigned int ch)
{
static const struct {
unsigned src, dst;
} mirror_pairs[] = {
#include "unicode/bidi_mirror.h"
};
int i, j, k;
i = -1;
j = lenof(mirror_pairs);
while (j - i > 1) {
k = (i + j) / 2;
if (ch < mirror_pairs[k].src)
j = k;
else if (ch > mirror_pairs[k].src)
i = k;
else
return mirror_pairs[k].dst;
}
return ch;
}
/*
* Identify the bracket characters treated specially by bidi rule
* BD19, and return their paired character(s).
*/
typedef enum { BT_NONE, BT_OPEN, BT_CLOSE } BracketType;
typedef struct BracketTypeData {
unsigned partner, equiv_partner;
BracketType type;
} BracketTypeData;
static BracketTypeData bracket_type(unsigned int ch)
{
static const struct {
unsigned src;
BracketTypeData payload;
} bracket_pairs[] = {
#include "unicode/bidi_brackets.h"
};
int i, j, k;
i = -1;
j = lenof(bracket_pairs);
while (j - i > 1) {
k = (i + j) / 2;
if (ch < bracket_pairs[k].src) {
j = k;
} else if (ch > bracket_pairs[k].src) {
i = k;
} else {
return bracket_pairs[k].payload;
}
}
static const BracketTypeData null = { 0, 0, BT_NONE };
return null;
}
/*
* Function exported to front ends to allow them to identify
* bidi-active characters (in case, for example, the platform's
* text display function can't conveniently be prevented from doing
* its own bidi and so special treatment is required for characters
* that would cause the bidi algorithm to activate).
*
* This function is passed a single Unicode code point, and returns
* nonzero if the presence of this code point can possibly cause
* the bidi algorithm to do any reordering. Thus, any string
* composed entirely of characters for which is_rtl() returns zero
* should be safe to pass to a bidi-active platform display
* function without fear.
*
* (is_rtl() must therefore also return true for any character
* which would be affected by Arabic shaping, but this isn't
* important because all such characters are right-to-left so it
* would have flagged them anyway.)
*/
bool is_rtl(int c)
{
return typeIsBidiActive(bidi_getType(c));
}
/* The Main shaping function, and the only one to be used
* by the outside world.
*
* line: buffer to apply shaping to. this must be passed by doBidi() first
* to: output buffer for the shaped data
* count: number of characters in line
*/
int do_shape(bidi_char *line, bidi_char *to, int count)
{
int i, tempShape;
bool ligFlag = false;
for (i=0; i<count; i++) {
to[i] = line[i];
tempShape = STYPE(line[i].wc);
switch (tempShape) {
case SC:
break;
case SU:
break;
case SR:
tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
to[i].wc = SFINAL((SISOLATED(line[i].wc)));
else
to[i].wc = SISOLATED(line[i].wc);
break;
case SD:
/* Make Ligatures */
tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
if (line[i].wc == 0x644) {
if (i > 0) switch (line[i-1].wc) {
case 0x622:
ligFlag = true;
if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
to[i].wc = 0xFEF6;
else
to[i].wc = 0xFEF5;
break;
case 0x623:
ligFlag = true;
if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
to[i].wc = 0xFEF8;
else
to[i].wc = 0xFEF7;
break;
case 0x625:
ligFlag = true;
if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
to[i].wc = 0xFEFA;
else
to[i].wc = 0xFEF9;
break;
case 0x627:
ligFlag = true;
if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
to[i].wc = 0xFEFC;
else
to[i].wc = 0xFEFB;
break;
}
if (ligFlag) {
to[i-1].wc = 0x20;
ligFlag = false;
break;
}
}
if ((tempShape == SL) || (tempShape == SD) || (tempShape == SC)) {
tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
if ((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
to[i].wc = SMEDIAL((SISOLATED(line[i].wc)));
else
to[i].wc = SFINAL((SISOLATED(line[i].wc)));
break;
}
tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
if ((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
to[i].wc = SINITIAL((SISOLATED(line[i].wc)));
else
to[i].wc = SISOLATED(line[i].wc);
break;
}
}
return 1;
}
typedef enum { DO_NEUTRAL, DO_LTR, DO_RTL } DirectionalOverride;
typedef struct DSStackEntry {
/*
* An entry in the directional status stack (rule section X).
*/
unsigned char level;
bool isolate;
DirectionalOverride override;
} DSStackEntry;
typedef struct BracketStackEntry {
/*
* An entry in the bracket-pair-tracking stack (rule BD16).
*/
unsigned ch;
size_t c;
} BracketStackEntry;
typedef struct IsolatingRunSequence {
size_t start, end;
BidiType sos, eos, embeddingDirection;
} IsolatingRunSequence;
#define MAX_DEPTH 125 /* specified in the standard */
struct BidiContext {
/*
* Storage space preserved between runs, all allocated to the same
* length (internal_array_sizes).
*/
size_t internal_array_sizes;
BidiType *types, *origTypes;
unsigned char *levels;
size_t *irsindices, *bracketpos;
bool *irsdone;
/*
* Separately allocated with its own size field
*/
IsolatingRunSequence *irslist;
size_t irslistsize;
/*
* Rewritten to point to the input to the currently active run of
* the bidi algorithm
*/
bidi_char *text;
size_t textlen;
/*
* State within a run of the algorithm
*/
BidiType paragraphOverride;
DSStackEntry dsstack[MAX_DEPTH + 2];
size_t ds_sp;
size_t overflowIsolateCount, overflowEmbeddingCount, validIsolateCount;
unsigned char paragraphLevel;
size_t *irs;
size_t irslen;
BidiType sos, eos, embeddingDirection;
BracketStackEntry bstack[63]; /* constant size specified in rule BD16 */
};
BidiContext *bidi_new_context(void)
{
BidiContext *ctx = snew(BidiContext);
memset(ctx, 0, sizeof(BidiContext));
return ctx;
}
void bidi_free_context(BidiContext *ctx)
{
sfree(ctx->types);
sfree(ctx->origTypes);
sfree(ctx->levels);
sfree(ctx->irsindices);
sfree(ctx->irsdone);
sfree(ctx->bracketpos);
sfree(ctx->irslist);
sfree(ctx);
}
static void ensure_arrays(BidiContext *ctx, size_t textlen)
{
if (textlen <= ctx->internal_array_sizes)
return;
ctx->internal_array_sizes = textlen;
ctx->types = sresize(ctx->types, ctx->internal_array_sizes, BidiType);
ctx->origTypes = sresize(ctx->origTypes, ctx->internal_array_sizes,
BidiType);
ctx->levels = sresize(ctx->levels, ctx->internal_array_sizes,
unsigned char);
ctx->irsindices = sresize(ctx->irsindices, ctx->internal_array_sizes,
size_t);
ctx->irsdone = sresize(ctx->irsdone, ctx->internal_array_sizes, bool);
ctx->bracketpos = sresize(ctx->bracketpos, ctx->internal_array_sizes,
size_t);
}
static void setup_types(BidiContext *ctx)
{
for (size_t i = 0; i < ctx->textlen; i++)
ctx->types[i] = ctx->origTypes[i] = bidi_getType(ctx->text[i].wc);
}
static bool text_needs_bidi(BidiContext *ctx)
{
/*
* Initial optimisation: check for any bidi-active character at
* all in an input line. If there aren't any, we can skip the
* whole algorithm.
*
* Also include the paragraph override in this check!
*/
for (size_t i = 0; i < ctx->textlen; i++)
if (typeIsBidiActive(ctx->types[i]))
return true;
return typeIsBidiActive(ctx->paragraphOverride);
}
static size_t find_matching_pdi(const BidiType *types, size_t i, size_t size)
{
/* Assuming that types[i] is an isolate initiator, find its
* matching PDI by rule BD9. */
unsigned counter = 1;
i++;
for (; i < size; i++) {
BidiType t = types[i];
if (typeIsIsolateInitiator(t)) {
counter++;
} else if (t == PDI) {
counter--;
if (counter == 0)
return i;
}
}
/* If no PDI was found, return the length of the array. */
return size;
}
static unsigned char rule_p2_p3(const BidiType *types, size_t size)
{
/*
* Rule P2. Find the first strong type (L, R or AL), ignoring
* anything inside an isolated segment.
*
* Rule P3. If that type is R or AL, choose a paragraph embeddding
* level of 1, otherwise 0.
*/
for (size_t i = 0; i < size; i++) {
BidiType t = types[i];
if (typeIsIsolateInitiator(t))
i = find_matching_pdi(types, i, size);
else if (typeIsStrong(t))
return (t == L ? 0 : 1);
}
return 0; /* default if no strong type found */
}
static void set_paragraph_level(BidiContext *ctx)
{
if (ctx->paragraphOverride == L)
ctx->paragraphLevel = 0;
else if (ctx->paragraphOverride == R)
ctx->paragraphLevel = 1;
else
ctx->paragraphLevel = rule_p2_p3(ctx->types, ctx->textlen);
}
static inline unsigned char nextOddLevel(unsigned char x) { return (x+1)|1; }
static inline unsigned char nextEvenLevel(unsigned char x) { return (x|1)+1; }
static inline void push(BidiContext *ctx, unsigned char level,
DirectionalOverride override, bool isolate)
{
ctx->ds_sp++;
assert(ctx->ds_sp < lenof(ctx->dsstack));
ctx->dsstack[ctx->ds_sp].level = level;
ctx->dsstack[ctx->ds_sp].override = override;
ctx->dsstack[ctx->ds_sp].isolate = isolate;
}
static inline void pop(BidiContext *ctx)
{
assert(ctx->ds_sp > 0);
ctx->ds_sp--;
}
static void process_explicit_embeddings(BidiContext *ctx)
{
/*
* Rule X1 initialisation.
*/
ctx->ds_sp = (size_t)-1;
push(ctx, ctx->paragraphLevel, DO_NEUTRAL, false);
ctx->overflowIsolateCount = 0;
ctx->overflowEmbeddingCount = 0;
ctx->validIsolateCount = 0;
#define stk (&ctx->dsstack[ctx->ds_sp])
for (size_t i = 0; i < ctx->textlen; i++) {
BidiType t = ctx->types[i];
switch (t) {
case RLE: case LRE: case RLO: case LRO: {
/* Rules X2-X5 */
unsigned char newLevel;
DirectionalOverride override;
#ifndef REMOVE_FORMATTING_CHARS
ctx->levels[i] = stk->level;
#endif
switch (t) {
case RLE: /* rule X2 */
newLevel = nextOddLevel(stk->level);
override = DO_NEUTRAL;
break;
case LRE: /* rule X3 */
newLevel = nextEvenLevel(stk->level);
override = DO_NEUTRAL;
break;
case RLO: /* rule X4 */
newLevel = nextOddLevel(stk->level);
override = DO_RTL;
break;
case LRO: /* rule X5 */
newLevel = nextEvenLevel(stk->level);
override = DO_LTR;
break;
default:
unreachable("how did this get past the outer switch?");
}
if (newLevel <= MAX_DEPTH &&
ctx->overflowIsolateCount == 0 &&
ctx->overflowEmbeddingCount == 0) {
/* Embedding code is valid. Push a stack entry. */
push(ctx, newLevel, override, false);
} else {
/* Embedding code is an overflow one. */
if (ctx->overflowIsolateCount == 0)
ctx->overflowEmbeddingCount++;
}
break;
}
case RLI: case LRI: case FSI: {
/* Rules X5a, X5b, X5c */
if (t == FSI) {
/* Rule X5c: decide whether this should be treated
* like RLI or LRI */
size_t pdi = find_matching_pdi(ctx->types, i, ctx->textlen);
unsigned char level = rule_p2_p3(ctx->types + (i + 1),
pdi - (i + 1));
t = (level == 1 ? RLI : LRI);
}
ctx->levels[i] = stk->level;
if (stk->override != DO_NEUTRAL)
ctx->types[i] = (stk->override == DO_LTR ? L :
stk->override == DO_RTL ? R : t);
unsigned char newLevel = (t == RLI ? nextOddLevel(stk->level) :
nextEvenLevel(stk->level));
if (newLevel <= MAX_DEPTH &&
ctx->overflowIsolateCount == 0 &&
ctx->overflowEmbeddingCount == 0) {
/* Isolate code is valid. Push a stack entry. */
push(ctx, newLevel, DO_NEUTRAL, true);
ctx->validIsolateCount++;
} else {
/* Isolate code is an overflow one. */
ctx->overflowIsolateCount++;
}
break;
}
case PDI: {
/* Rule X6a */
if (ctx->overflowIsolateCount > 0) {
ctx->overflowIsolateCount--;
} else if (ctx->validIsolateCount == 0) {
/* Do nothing: spurious isolate-pop */
} else {
/* Valid isolate-pop. We expect that the stack must
* therefore contain at least one isolate==true entry,
* so pop everything up to and including it. */
ctx->overflowEmbeddingCount = 0;
while (!stk->isolate)
pop(ctx);
pop(ctx);
ctx->validIsolateCount--;
}
ctx->levels[i] = stk->level;
if (stk->override != DO_NEUTRAL)
ctx->types[i] = (stk->override == DO_LTR ? L : R);
break;
}
case PDF: {
/* Rule X7 */
if (ctx->overflowIsolateCount > 0) {
/* Do nothing if we've overflowed on isolates */
} else if (ctx->overflowEmbeddingCount > 0) {
ctx->overflowEmbeddingCount--;
} else if (ctx->ds_sp > 0 && !stk->isolate) {
pop(ctx);
} else {
/* Do nothing: spurious embedding-pop */
}
#ifndef REMOVE_FORMATTING_CHARS
ctx->levels[i] = stk->level;
#endif
break;
}
case B: {
/* Rule X8: if an explicit paragraph separator appears in
* this text at all then it does not participate in any of
* the above, and just gets assigned the paragraph level.
*
* PS, it had better be right at the end of the text,
* because we have not implemented rule P1 in this code. */
assert(i == ctx->textlen - 1);
ctx->levels[i] = ctx->paragraphLevel;
break;
}
case BN: {
/*
* The section 5.2 adjustment to rule X6 says that we
* apply it to BN just like any other class. But I think
* this can't possibly give the same results as the
* unmodified algorithm.
*
* Proof: adding RLO BN or LRO BN at the end of a
* paragraph should not change the output of the standard
* algorithm, because the override doesn't affect the BN
* in rule X6, and then rule X9 removes both. But with the
* modified rule X6, the BN is changed into R or L, and
* then rule X9 doesn't remove it, and then you've added a
* strong type that will set eos for the level run just
* before the override. And whatever the standard
* algorithm set eos to, _one_ of these override sequences
* will disagree with it.
*
* So I think we just set the BN's level, and don't change
* its type.
*/
ctx->levels[i] = stk->level;
break;
}
default: {
/* Rule X6. */
ctx->levels[i] = stk->level;
if (stk->override != DO_NEUTRAL)
ctx->types[i] = (stk->override == DO_LTR ? L : R);
break;
}
}
}
#undef stk
}
static void remove_embedding_characters(BidiContext *ctx)
{
#ifndef REMOVE_FORMATTING_CHARS
/*
* Rule X9, as modified by section 5.2: turn embedding (but not
* isolate) characters into BN.
*/
for (size_t i = 0; i < ctx->textlen; i++) {
BidiType t = ctx->types[i];
if (typeIsRemovedDuringProcessing(t)) {
ctx->types[i] = BN;
/*
* My own adjustment to the section 5.2 mods: a sequence
* of contiguous BN generated by this setup should never
* be at different levels from each other.
*
* An example where this goes wrong is if you open two
* LREs in sequence, then close them again:
*
* ... LRE LRE PDF PDF ...
*
* The initial level assignment gives level 0 to the outer
* LRE/PDF pair, and level 2 to the inner one. The
* standard algorithm would remove all four, so this
* doesn't matter, and you end up with no break in the
* surrounding level run. But if you just rewrite the
* types of all those characters to BN and leave the
* levels in that state, then the modified algorithm will
* leave the middle two BN at level 2, dividing what
* should have been a long level run at level 0 into two
* separate ones.
*/
if (i > 0 && ctx->types[i-1] == BN)
ctx->levels[i] = ctx->levels[i-1];
}
}
#else
/*
* Rule X9, original version: completely remove embedding
* start/end characters and also boundary neutrals.
*/
size_t outpos = 0;
for (size_t i = 0; i < ctx->textlen; i++) {
BidiType t = ctx->types[i];
if (!typeIsRemovedDuringProcessing(t)) {
ctx->text[outpos] = ctx->text[i];
ctx->levels[outpos] = ctx->levels[i];
ctx->types[outpos] = ctx->types[i];
ctx->origTypes[outpos] = ctx->origTypes[i];
outpos++;
}
}
ctx->textlen = outpos;
#endif
}
typedef void (*irs_fn_t)(BidiContext *ctx);
static void find_isolating_run_sequences(BidiContext *ctx, irs_fn_t process)
{
/*
* Rule X10 / BD13. Now that we've assigned an embedding level to
* each character in the text, we have to divide the text into
* subsequences on which to do the next stage of processing.
*
* In earlier issues of the bidi algorithm, these subsequences
* were contiguous in the original text, and each one was a 'level
* run': a maximal contiguous subsequence of characters all at the
* same embedding level.
*
* But now we have isolates, and the point of an (isolate
* initiator ... PDI) sequence is that the whole sequence should
* be treated like a single BN for the purposes of formatting
* everything outside it. As a result, we now have to recombine
* our level runs into longer sequences, on the principle that if
* a level run ends with an isolate initiator, then we bring it