Skip to content

Commit

Permalink
Implement (!?) operator
Browse files Browse the repository at this point in the history
  • Loading branch information
hvr committed Jan 18, 2018
1 parent 7e8ef1d commit 230437b
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 0 deletions.
55 changes: 55 additions & 0 deletions cbits/cbits.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,61 @@ hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i)
assert(0);
}

/* Decode a single UTF-8 encoded code-point.
 *
 * buf[] must point at the first code unit of a well-formed UTF-8
 * sequence; nothing is validated here. The lead byte alone decides
 * how many continuation bytes are consumed:
 *
 *   0xxxxxxx                             ->  7 payload bits
 *   110xxxxx 10xxxxxx                    -> 11 payload bits
 *   1110xxxx 10xxxxxx 10xxxxxx           -> 16 payload bits
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  -> 21 payload bits
 *
 * Any lead byte with the top bit set that is neither 0xEx nor 0xFx
 * is decoded as a two-byte sequence.
 */
static inline uint32_t
hs_text_short_decode_cp(const uint8_t buf[])
{
  const uint8_t lead = buf[0];

  /* single code unit (ASCII range) */
  if (lead < 0x80)
    return lead;

  /* four code units: 3 + 6 + 6 + 6 payload bits */
  if (lead >= 0xf0)
    return ((uint32_t)(lead   & 0x07) << 18)
         | ((uint32_t)(buf[1] & 0x3f) << 12)
         | ((uint32_t)(buf[2] & 0x3f) <<  6)
         |  (uint32_t)(buf[3] & 0x3f);

  /* three code units: 4 + 6 + 6 payload bits */
  if (lead >= 0xe0)
    return ((uint32_t)(lead   & 0x0f) << 12)
         | ((uint32_t)(buf[1] & 0x3f) <<  6)
         |  (uint32_t)(buf[2] & 0x3f);

  /* two code units: 5 + 6 payload bits */
  return ((uint32_t)(lead   & 0x1f) << 6)
       |  (uint32_t)(buf[1] & 0x3f);
}

/* Look up the i-th code-point of a (valid) UTF-8 buffer of n bytes.
 *
 * The byte offset of the i-th code-point is obtained from
 * hs_text_short_index_ofs(); an offset at or beyond n means the index
 * is out of bounds, in which case the sentinel 0xFFFFFFFF is returned
 * instead of a code-point.
 */
uint32_t
hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i)
{
  const size_t pos = hs_text_short_index_ofs(buf, n, i);

  return (pos < n) ? hs_text_short_decode_cp(buf + pos)
                   : UINT32_C(0xffffffff);
}


/* Validate UTF8 encoding
Expand Down
2 changes: 2 additions & 0 deletions src-test/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ qcProps :: TestTree
qcProps = testGroup "Properties"
[ QC.testProperty "length/fromText" $ \t -> IUT.length (IUT.fromText t) == T.length t
, QC.testProperty "length/fromString" $ \s -> IUT.length (IUT.fromString s) == length s
, QC.testProperty "(!?)" $ \t -> let t' = IUT.fromText t
in mapMaybe (t' IUT.!?) [-5 .. 5+T.length t ] == T.unpack t
, QC.testProperty "toText.fromText" $ \t -> (IUT.toText . IUT.fromText) t == t
, QC.testProperty "fromByteString" $ \b -> IUT.fromByteString b == fromByteStringRef b
, QC.testProperty "fromByteString.toByteString" $ \t -> let ts = IUT.fromText t in (IUT.fromByteString . IUT.toByteString) ts == Just ts
Expand Down
1 change: 1 addition & 0 deletions src/Data/Text/Short.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ module Data.Text.Short
, null
, length
, isAscii
, (!?)
, splitAt

-- * Conversions
Expand Down
16 changes: 16 additions & 0 deletions src/Data/Text/Short/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ module Data.Text.Short.Internal
, Data.Text.Short.Internal.length
, Data.Text.Short.Internal.isAscii
, Data.Text.Short.Internal.splitAt
, (!?)

-- * Conversions
-- ** 'String'
Expand Down Expand Up @@ -251,6 +252,21 @@ isValidUtf8 st = (==0) $ unsafePerformIO (c_text_short_is_valid_utf8 (toByteArra

foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt

-- | \(\mathcal{O}(n)\) Look up the code-point at (zero-based) index /i/
-- of a 'ShortText'.
--
-- Yields 'Nothing' when /i/ is negative or lies past the last code-point.
--
-- @since TBD
(!?) :: ShortText -> Int -> Maybe Char
st !? i
  | i >= 0, codePoint < 0x110000 = Just (chr (fromIntegral codePoint))
  | otherwise                    = Nothing
  where
    -- Only forced once @i >= 0@ has been established, so a negative index
    -- is never converted to 'CSize'. The C routine reports an out-of-bounds
    -- index with a value that is not below 0x110000.
    -- NOTE(review): 'unsafePerformIO' presumes the foreign call is a pure
    -- read of the byte array -- confirm against the C implementation.
    codePoint =
      unsafePerformIO $
        c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i)

-- Binding to the C routine @hs_text_short_index_cp@. Arguments, in order:
-- the byte array, its size, and the index of the wanted code-point.
-- The result is the decoded code-point; the C side documents @0xFFFFFFFF@
-- as its out-of-bounds return value.
foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO Word32

-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
--
-- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties:
Expand Down

0 comments on commit 230437b

Please sign in to comment.