From ad6a090f9b828bddb3b657e86fb131654addb546 Mon Sep 17 00:00:00 2001 From: Haozhun Jin Date: Fri, 20 Mar 2015 12:01:05 -0700 Subject: [PATCH] bugfix: char class casefold for certain chars When a character's code is less than or equal to the single-byte maximum (0xff), yet it takes more than 1 byte in the current encoding, the case folding code incorrectly puts it in the bitset instead of a code range. As a result, for the UTF-8 encoding, case folding works incorrectly on characters in the range \u0080 to \u00ff (Latin-1 Supplement). Before the fix: * `"\u00c2"` `[\u00e0-\u00e5]` returns false * `"\u00c2"` `[\u00e2]` returns false * `"\u00c2"` `\u00e2` returns true --- src/org/joni/ApplyCaseFold.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/joni/ApplyCaseFold.java b/src/org/joni/ApplyCaseFold.java index 7dd84ce1..6a8d1c35 100644 --- a/src/org/joni/ApplyCaseFold.java +++ b/src/org/joni/ApplyCaseFold.java @@ -41,7 +41,7 @@ public void apply(int from, int[]to, int length, Object o) { if (Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS) { if ((inCC && !cc.isNot()) || (!inCC && cc.isNot())) { - if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) { + if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE || enc.codeToMbcLength(to[0]) > 1) { cc.addCodeRange(env, to[0], to[0]); } else { /* /(?i:[^A-C])/.match("a") ==> fail. */