diff --git a/cbits/cbits.c b/cbits/cbits.c index cf7c85c..b1acc59 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -71,6 +71,9 @@ const static bool is_bigendian = false; /* test whether octet in UTF-8 steam is not a continuation byte, i.e. a leading byte */ #define utf8_lead_p(octet) (((octet) & 0xc0) != 0x80) +/* Unicode code-point; valid values are 0 <= x < 0x110000 (i.e. at most U+10FFFF). The index functions additionally return (codepoint_t)-1 as an out-of-bounds marker. */ +typedef HsWord codepoint_t; + /* Count number of code-points in well-formed utf8 string */ size_t hs_text_short_length(const uint8_t buf[], const size_t n) @@ -246,14 +249,14 @@ hs_text_short_decode_cp(const uint8_t buf[]) } /* decode codepoint starting at buf[ofs] */ -uint32_t +codepoint_t hs_text_short_ofs_cp(const uint8_t buf[], const size_t ofs) { return hs_text_short_decode_cp(buf+ofs); } /* reverse-decode codepoint starting at offset right after a code-point */ -uint32_t +codepoint_t hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs) { /* 7 bits | 0xxxxxxx @@ -314,30 +317,30 @@ hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs) /* Retrieve i-th code-point in (valid) UTF8 stream * - * Returns 0xFFFFFFFF if out of bounds + * Returns (codepoint_t)-1 if out of bounds (ofs >= n) */ -uint32_t +codepoint_t hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i) { const size_t ofs = hs_text_short_index_ofs(buf, n, i); if (ofs >= n) - return UINT32_C(0xffffffff); + return -1; return hs_text_short_decode_cp(&buf[ofs]); } /* Retrieve i-th code-point in (valid) UTF8 stream * - * Returns 0xFFFFFFFF if out of bounds + * Returns (codepoint_t)-1 if out of bounds (ofs >= n) */ -uint32_t +codepoint_t hs_text_short_index_cp_rev(const uint8_t buf[], const size_t n, const size_t i) { const size_t ofs = hs_text_short_index_ofs_rev(buf, n, i); if (ofs >= n) - return UINT32_C(0xffffffff); + return -1; return hs_text_short_decode_cp(&buf[ofs]); } diff --git a/src/Data/Text/Short/Internal.hs b/src/Data/Text/Short/Internal.hs index 6de15a5..b8852dc 100644 --- a/src/Data/Text/Short/Internal.hs +++ b/src/Data/Text/Short/Internal.hs @@ -699,9 +699,11 @@ 
encodeStringShort te = BSS.toShort . encodeString te isValidUtf8 :: ShortText -> Bool isValidUtf8 st = (==0) $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st)) +type CCodePoint = Word {- Haskell-side counterpart of the C typedef codepoint_t (HsWord) in cbits.c; used as the FFI result type of the code-point helpers. The C index functions return (codepoint_t)-1 when the index is out of bounds. -} + foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt -foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO Word32 +foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO CCodePoint -- | \(\mathcal{O}(n)\) Lookup /i/-th code-point in 'ShortText'. -- @@ -719,8 +721,7 @@ indexMaybe st i | i < 0 = Nothing | otherwise = cp2chSafe cp where - cp = CP $ fromIntegral $ - unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i)) + cp = CP $ unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i)) -- | \(\mathcal{O}(n)\) Lookup /i/-th code-point from the end of 'ShortText'. -- @@ -738,10 +739,9 @@ indexEndMaybe st i | i < 0 = Nothing | otherwise = cp2chSafe cp where - cp = CP $ fromIntegral $ - unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i)) + cp = CP $ unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i)) -foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO Word32 +foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO CCodePoint -- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves. @@ -1437,15 +1437,15 @@ writeRepChar mba ofs = do -- beware: UNSAFE! 
readCodePoint :: ShortText -> B -> CP readCodePoint st (csizeFromB -> ofs) - = CP $ fromIntegral $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs) + = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs) {- the C helper decodes at buf+ofs and takes no length argument, so ofs is not range-checked -} -foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO Word32 +foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO CCodePoint readCodePointRev :: ShortText -> B -> CP readCodePointRev st (csizeFromB -> ofs) - = CP $ fromIntegral $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs) + = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs) {- per the C-side comment, ofs is the offset right after the code-point to decode -} -foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO Word32 +foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO CCodePoint ---------------------------------------------------------------------------- -- string & list literals