Skip to content

Commit

Permalink
Add splitAt operation
Browse files Browse the repository at this point in the history
  • Loading branch information
hvr committed Jan 18, 2018
1 parent d2c4242 commit 7e8ef1d
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 26 deletions.
42 changes: 42 additions & 0 deletions cbits/cbits.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,48 @@ hs_text_short_length(const uint8_t buf[], const size_t n)
return l;
}

/* Locate offset of j-th code-point in well-formed utf8 string
*
*/
size_t
hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i)
{
size_t m = 0;
size_t j = 0;

for (;;) {
if (j >= i)
return m;

const size_t rest = n-m;

if (rest < (i-j))
return n;

const uint8_t b0 = buf[m];

if (!(b0 & 0x80))
m += 1; /* 0_______ */
else
switch(b0 >> 4) {
case 0xf: /* 11110___ */
m += 4;
break;
case 0xe: /* 1110____ */
m += 3;
break;
default: /* 110_____ */
m += 2;
break;
}

j += 1;
}

assert(0);
}


/* Validate UTF8 encoding
7 bits | 0xxxxxxx
Expand Down
26 changes: 15 additions & 11 deletions src-test/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@

module Main(main) where

import qualified Data.Text.Short as IUT
import Test.Tasty
import Test.Tasty.QuickCheck as QC
import Test.Tasty.HUnit
import Test.QuickCheck.Instances ()
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.String as D.S
import Data.Binary
import Data.Char
import Data.Monoid
import Data.Binary
import Data.Char
import Data.Maybe
import Data.Monoid
import qualified Data.String as D.S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Short as IUT
import Test.QuickCheck.Instances ()
import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck as QC

fromByteStringRef = either (const Nothing) (Just . IUT.fromText) . T.decodeUtf8'

Expand All @@ -32,6 +33,9 @@ qcProps = testGroup "Properties"
, QC.testProperty "toString.fromString" $ \s -> (IUT.toString . IUT.fromString) s == s
, QC.testProperty "isAscii" $ \s -> IUT.isAscii (IUT.fromString s) == all isAscii s
, QC.testProperty "isAscii2" $ \t -> IUT.isAscii (IUT.fromText t) == T.all isAscii t
, QC.testProperty "splitAt" $ \t -> let t' = IUT.fromText t
mapBoth f (x,y) = (f x, f y)
in and [ mapBoth IUT.toText (IUT.splitAt i t') == T.splitAt i t | i <- [-5 .. 5+T.length t ] ]
]

unitTests = testGroup "Unit-tests"
Expand Down
5 changes: 3 additions & 2 deletions src/Data/Text/Short.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ module Data.Text.Short
, null
, length
, isAscii
, splitAt

-- * Conversions
-- ** 'String'
Expand All @@ -37,5 +38,5 @@ module Data.Text.Short

) where

import Data.Text.Short.Internal
import Prelude ()
import Data.Text.Short.Internal
import Prelude ()
105 changes: 92 additions & 13 deletions src/Data/Text/Short/Internal.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
{-# LANGUAGE CPP, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes, Unsafe #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE UnliftedFFITypes #-}
{-# LANGUAGE Unsafe #-}

-- |
-- Module : Data.Text.Short.Internal
Expand All @@ -17,6 +23,7 @@ module Data.Text.Short.Internal
, Data.Text.Short.Internal.null
, Data.Text.Short.Internal.length
, Data.Text.Short.Internal.isAscii
, Data.Text.Short.Internal.splitAt

-- * Conversions
-- ** 'String'
Expand All @@ -40,25 +47,29 @@ module Data.Text.Short.Internal

) where

import Control.DeepSeq (NFData)
import Control.DeepSeq (NFData)
-- import Control.Exception as E
import qualified Data.ByteString as BS
import Data.ByteString.Short (ShortByteString)
import qualified Data.ByteString.Short as BSS
import Data.Binary
import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as BB
import Data.ByteString.Short (ShortByteString)
import qualified Data.ByteString.Short as BSS
import qualified Data.ByteString.Short.Internal as BSSI
import Data.Char
import Data.Hashable (Hashable)
import Data.Hashable (Hashable)
import Data.Semigroup
import qualified Data.String as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.String as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Foreign.C
import GHC.Exts (ByteArray#)
import qualified GHC.Foreign as GHC
import GHC.Exts (ByteArray#, Int (I#),
MutableByteArray#,
copyByteArray#, newByteArray#,
unsafeFreezeByteArray#)
import qualified GHC.Foreign as GHC
import GHC.IO.Encoding
import GHC.ST
import System.IO.Unsafe
import Data.Binary
import qualified Data.ByteString.Builder as BB

-- | A compact representation of Unicode strings.
--
Expand Down Expand Up @@ -240,6 +251,74 @@ isValidUtf8 st = (==0) $ unsafePerformIO (c_text_short_is_valid_utf8 (toByteArra

foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt

-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
--
-- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties:
--
-- prop> length (fst (split n t)) == min (length t) (max 0 n)
--
-- prop> fst (split n t) <> snd (split n t) == t
--
-- @since TBD
splitAt :: Int -> ShortText -> (ShortText,ShortText)
splitAt i st
| i <= 0 = (mempty,st)
| len2 <= 0 = (st,mempty)
| otherwise = (slice st 0 ofs, slice st ofs len2)
where
ofs = unsafePerformIO (c_text_short_index_ofs (toByteArray# st) stsz (fromIntegral i))
stsz = toCSize st
len2 = stsz-ofs

foreign import ccall unsafe "hs_text_short_index_ofs" c_text_short_index_ofs :: ByteArray# -> CSize -> CSize -> IO CSize

----------------------------------------------------------------------------

-- | Construct a new 'ShortText' from an existing one by slicing
--
-- NB: The 'CSize' arguments refer to byte-offsets
slice :: ShortText -> CSize -> CSize -> ShortText
slice (ShortText x) ofs_ len_ = ShortText (sliceSBS x (fromIntegral ofs_) (fromIntegral len_))
where
sliceSBS :: ShortByteString -> Int -> Int -> ShortByteString
sliceSBS sbs@(BSSI.SBS ba#) ofs len
| ofs < 0 = error "invalid offset"
| len < 0 = error "invalid length"
| len' == 0 = mempty
| otherwise = createSBS len' go
where
len0 = BSS.length sbs
len' = max 0 (min len (len0-ofs))
ofs' = max 0 ofs

go :: MBA s -> ST s ()
go (MBA# mba#) = ST $ \s -> case copyByteArray# ba# (toI ofs') mba# 0# (toI len') s of
s' -> (# s', () #)

toI (I# i#) = i#

----------------------------------------------------------------------------
-- low-level MutableByteArray# helpers

data MBA s = MBA# (MutableByteArray# s)

createSBS :: Int -> (forall s. MBA s -> ST s ()) -> ShortByteString
createSBS n go = runST $ do
mba <- newByteArray n
go mba
unsafeFreezeSBS mba

unsafeFreezeSBS :: MBA s -> ST s ShortByteString
unsafeFreezeSBS (MBA# mba#)
= ST $ \s -> case unsafeFreezeByteArray# mba# s of
(# s', ba# #) -> (# s', BSSI.SBS ba# #)

newByteArray :: Int -> ST s (MBA s)
newByteArray (I# n#)
= ST $ \s -> case newByteArray# n# s of
(# s', mba# #) -> (# s', MBA# mba# #)


{- TODO:
{-# RULES "ShortText strlit" forall s . fromString (unpackCString# s) = fromAddr# #-}
...
Expand Down

0 comments on commit 7e8ef1d

Please sign in to comment.