From 93b7dffb2f521288169117ab2cd6caff7946e96a Mon Sep 17 00:00:00 2001
From: Manish Goregaokar <manishsmail@gmail.com>
Date: Tue, 22 Oct 2019 17:16:56 -0700
Subject: [PATCH] Add WSegSpace support for in word boundaries from Unicode 11

---
 src/lib.rs  |  4 ++--
 src/word.rs | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 78a0b6f..fce3c52 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,7 +29,7 @@
 //!
 //!     let s = "The quick (\"brown\")  fox";
 //!     let w = s.split_word_bounds().collect::<Vec<&str>>();
-//!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
+//!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
 //!     assert_eq!(w, b);
 //! }
 //! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
     /// ```
     /// # use self::unicode_segmentation::UnicodeSegmentation;
     /// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
-    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
+    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
     ///
     /// assert_eq!(&swu1[..], b);
     /// ```
diff --git a/src/word.rs b/src/word.rs
index f2b839a..6e9c049 100644
--- a/src/word.rs
+++ b/src/word.rs
@@ -102,6 +102,7 @@ enum UWordBoundsState {
     FormatExtend(FormatExtendType),
     Zwj,
     Emoji,
+    WSegSpace,
 }
 
 // subtypes for FormatExtend state in UWordBoundsState
@@ -156,6 +157,8 @@ impl<'a> Iterator for UWordBounds<'a> {
         // Whether or not the previous category was ZWJ
         // ZWJs get collapsed, so this handles precedence of WB3c over WB4
         let mut prev_zwj;
+        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
+        let mut skipped_format_extend = false;
         for (curr, ch) in self.string.char_indices() {
             idx = curr;
             prev_zwj = cat == wd::WC_ZWJ;
@@ -177,6 +180,7 @@ impl<'a> Iterator for UWordBounds<'a> {
             if state != Start {
                 match cat {
                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
+                        skipped_format_extend = true;
                         continue
                     }
                     _ => {}
@@ -219,6 +223,7 @@ impl<'a> Iterator for UWordBounds<'a> {
                     wd::WC_Regional_Indicator => Regional(RegionalState::Half),  // rule WB13c
                     wd::WC_LF | wd::WC_Newline => break,    // rule WB3a
                     wd::WC_ZWJ => Zwj,                      // rule WB3c
+                    wd::WC_WSegSpace => WSegSpace,          // rule WB3d
                     _ => {
                         if let Some(ncat) = self.get_next_cat(idx) {                // rule WB4
                             if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
@@ -230,6 +235,13 @@ impl<'a> Iterator for UWordBounds<'a> {
                         break;                                                      // rule WB999
                     }
                 },
+                WSegSpace => match cat {
+                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
+                    _ => {
+                        take_curr = false;
+                        break;
+                    }
+                },
                 Zwj => {
                     // We already handle WB3c above.
                     take_curr = false;
@@ -371,6 +383,8 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
         let mut savestate = Start;
         let mut cat = wd::WC_Any;
 
+        let mut skipped_format_extend = false;
+
         for (curr, ch) in self.string.char_indices().rev() {
             previdx = idx;
             idx = curr;
@@ -409,6 +423,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
                 state = savestate;
                 previdx = saveidx;
                 take_cat = false;
+                skipped_format_extend = true;
             }
 
             // Don't use `continue` in this match without updating `catb`
@@ -427,6 +442,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
                         saveidx = idx;
                         FormatExtend(AcceptQLetter)                         // rule WB7a
                     },
+                    wd::WC_WSegSpace => WSegSpace,
                     wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                         if state == Start {
                             if cat == wd::WC_LF {
@@ -451,6 +467,15 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
                         break;
                     }
                 },
+                WSegSpace => match cat {                          // rule WB3d
+                    wd::WC_WSegSpace if !skipped_format_extend => {
+                        WSegSpace
+                    }
+                    _ => {
+                        take_curr = false;
+                        break;
+                    }
+                },
                 Letter | HLetter => match cat {
                     wd::WC_ALetter => Letter,               // rule WB5
                     wd::WC_Hebrew_Letter => HLetter,        // rule WB5