From 230437b051eebba6a304476a5dd102324b382bc5 Mon Sep 17 00:00:00 2001 From: Herbert Valerio Riedel Date: Thu, 18 Jan 2018 19:52:39 +0100 Subject: [PATCH] Implement (!?) operator --- cbits/cbits.c | 55 +++++++++++++++++++++++++++++++++ src-test/Tests.hs | 2 ++ src/Data/Text/Short.hs | 1 + src/Data/Text/Short/Internal.hs | 16 ++++++++++ 4 files changed, 74 insertions(+) diff --git a/cbits/cbits.c b/cbits/cbits.c index f35d877..84700b7 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -111,6 +111,61 @@ hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i) assert(0); } +/* Decode UTF8 code units into code-point + * Assumes buf[] points to start of a valid UTF8-encoded code-point + */ +static inline uint32_t +hs_text_short_decode_cp(const uint8_t buf[]) +{ + /* 7 bits | 0xxxxxxx + * 11 bits | 110yyyyx 10xxxxxx + * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx + * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx + */ + + const uint8_t b0 = buf[0]; + + if (!(b0 & 0x80)) + return b0; + + uint32_t cp = 0; + + switch(b0 >> 4) { + case 0xf: /* 11110___ */ + cp = ((uint32_t)(b0 & 0x07)) << (6+6+6); + cp |= ((uint32_t)(buf[1] & 0x3f)) << (6+6); + cp |= ((uint32_t)(buf[2] & 0x3f)) << 6; + cp |= buf[3] & 0x3f; + return cp; + + case 0xe: /* 1110____ */ + cp = ((uint32_t)(b0 & 0x0f)) << (6+6); + cp |= ((uint32_t)(buf[1] & 0x3f)) << 6; + cp |= buf[2] & 0x3f; + return cp; + + default: /* 110_____ */ + cp = ((uint32_t)(b0 & 0x1f)) << 6; + cp |= buf[1] & 0x3f; + return cp; + } +} + +/* Retrieve i-th code-point in (valid) UTF8 stream + * + * Returns 0xFFFFFFFF if out of bounds + */ +uint32_t +hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i) +{ + const size_t ofs = hs_text_short_index_ofs(buf, n, i); + + if (ofs >= n) + return UINT32_C(0xffffffff); + + return hs_text_short_decode_cp(&buf[ofs]); +} + /* Validate UTF8 encoding diff --git a/src-test/Tests.hs b/src-test/Tests.hs index 0c132ca..e43745a 100644 --- a/src-test/Tests.hs +++ b/src-test/Tests.hs @@ -27,6 +27,8 @@ qcProps :: TestTree qcProps = testGroup "Properties" [ QC.testProperty "length/fromText" $ \t -> IUT.length (IUT.fromText t) == T.length t , QC.testProperty "length/fromString" $ \s -> IUT.length (IUT.fromString s) == length s + , QC.testProperty "(!?)" $ \t -> let t' = IUT.fromText t + in mapMaybe (t' IUT.!?) [-5 .. 5+T.length t ] == T.unpack t , QC.testProperty "toText.fromText" $ \t -> (IUT.toText . IUT.fromText) t == t , QC.testProperty "fromByteString" $ \b -> IUT.fromByteString b == fromByteStringRef b , QC.testProperty "fromByteString.toByteString" $ \t -> let ts = IUT.fromText t in (IUT.fromByteString . IUT.toByteString) ts == Just ts diff --git a/src/Data/Text/Short.hs b/src/Data/Text/Short.hs index 1679ef8..fa304ff 100644 --- a/src/Data/Text/Short.hs +++ b/src/Data/Text/Short.hs @@ -16,6 +16,7 @@ module Data.Text.Short , null , length , isAscii + , (!?) , splitAt -- * Conversions diff --git a/src/Data/Text/Short/Internal.hs b/src/Data/Text/Short/Internal.hs index d293ed9..9d48cca 100644 --- a/src/Data/Text/Short/Internal.hs +++ b/src/Data/Text/Short/Internal.hs @@ -24,6 +24,7 @@ module Data.Text.Short.Internal , Data.Text.Short.Internal.length , Data.Text.Short.Internal.isAscii , Data.Text.Short.Internal.splitAt + , (!?) -- * Conversions -- ** 'String' @@ -251,6 +252,21 @@ isValidUtf8 st = (==0) $ unsafePerformIO (c_text_short_is_valid_utf8 (toByteArra foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt +-- | \(\mathcal{O}(n)\) Index /i/-th code-point in 'ShortText'. +-- +-- Returns 'Nothing' if out of bounds. +-- +-- @since TBD +(!?) :: ShortText -> Int -> Maybe Char +(!?) st i + | i < 0 = Nothing + | cp < 0x110000 = Just (chr (fromIntegral cp)) + | otherwise = Nothing + where + cp = unsafePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i)) + +foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO Word32 + -- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves. -- -- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties: