Skip to content

Commit

Permalink
Implement correct handling of wide Unicode characters
Browse files Browse the repository at this point in the history
  • Loading branch information
mrkkrp committed Jul 11, 2024
1 parent 739cdcb commit 1160ac8
Show file tree
Hide file tree
Showing 8 changed files with 278 additions and 13 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
*Megaparsec follows [SemVer](https://semver.org/).*

## Upcoming

* Implemented correct handling of wide Unicode characters in error messages.
To that end, a new module `Text.Megaparsec.Unicode` was introduced. [Issue
370](https://github.com/mrkkrp/megaparsec/issues/370).

## Megaparsec 9.6.1

* Exposed `Text.Megaparsec.State`, so that the new functions (`initialState`
Expand Down
3 changes: 2 additions & 1 deletion Text/Megaparsec/Error.hs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ import GHC.Generics
import Text.Megaparsec.Pos
import Text.Megaparsec.State
import Text.Megaparsec.Stream
import qualified Text.Megaparsec.Unicode as Unicode

----------------------------------------------------------------------------
-- Parse error type
Expand Down Expand Up @@ -397,7 +398,7 @@ errorBundlePretty ParseErrorBundle {..} =
lineNumber = (show . unPos . sourceLine) epos
padding = replicate (length lineNumber + 1) ' '
rpshift = unPos (sourceColumn epos) - 1
slineLen = length sline
slineLen = Unicode.stringLength sline
in padding
<> "|\n"
<> lineNumber
Expand Down
44 changes: 32 additions & 12 deletions Text/Megaparsec/Stream.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import qualified Data.Text.Lazy as TL
import Data.Word (Word8)
import Text.Megaparsec.Pos
import Text.Megaparsec.State
import qualified Text.Megaparsec.Unicode as Unicode

-- | Type class for inputs that can be consumed by the library.
--
Expand Down Expand Up @@ -426,6 +427,7 @@ class (Stream s) => VisualStream s where

instance VisualStream String where
showTokens Proxy = stringPretty
tokensLength Proxy = Unicode.stringLength

instance VisualStream B.ByteString where
showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
Expand All @@ -435,9 +437,11 @@ instance VisualStream BL.ByteString where

instance VisualStream T.Text where
showTokens Proxy = stringPretty
tokensLength Proxy = Unicode.stringLength

instance VisualStream TL.Text where
showTokens Proxy = stringPretty
tokensLength Proxy = Unicode.stringLength

-- | Type class for inputs that can also be used for error reporting.
--
Expand Down Expand Up @@ -510,37 +514,37 @@ class (Stream s) => TraversableStream s where
instance TraversableStream String where
-- NOTE Do not eta-reduce these (breaks inlining)
reachOffset o pst =
reachOffset' splitAt foldl' id id ('\n', '\t') o pst
reachOffset' splitAt foldl' id id ('\n', '\t') charInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst
reachOffsetNoLine' splitAt foldl' ('\n', '\t') charInc o pst

instance TraversableStream B.ByteString where
-- NOTE Do not eta-reduce these (breaks inlining)
reachOffset o pst =
reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst
reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) byteInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst
reachOffsetNoLine' B.splitAt B.foldl' (10, 9) byteInc o pst

instance TraversableStream BL.ByteString where
-- NOTE Do not eta-reduce these (breaks inlining)
reachOffset o pst =
reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst
reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) byteInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst
reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) byteInc o pst

instance TraversableStream T.Text where
-- NOTE Do not eta-reduce (breaks inlining of reachOffset').
reachOffset o pst =
reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst
reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') charInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst
reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') charInc o pst

instance TraversableStream TL.Text where
-- NOTE Do not eta-reduce (breaks inlining of reachOffset').
reachOffset o pst =
reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst
reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') charInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst
reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') charInc o pst

----------------------------------------------------------------------------
-- Helpers
Expand All @@ -564,6 +568,8 @@ reachOffset' ::
(Token s -> Char) ->
-- | Newline token and tab token
(Token s, Token s) ->
-- | Increment in column position for a token
(Token s -> Pos) ->
-- | Offset to reach
Int ->
-- | Initial 'PosState' to use
Expand All @@ -576,6 +582,7 @@ reachOffset'
fromToks
fromTok
(newlineTok, tabTok)
columnIncrement
o
PosState {..} =
( Just $ case expandTab pstateTabWidth
Expand Down Expand Up @@ -624,7 +631,7 @@ reachOffset'
(g . (fromTok ch :))
| otherwise ->
St
(SourcePos n l (c <> pos1))
(SourcePos n l (c <> columnIncrement ch))
(g . (fromTok ch :))
{-# INLINE reachOffset' #-}

Expand All @@ -639,6 +646,8 @@ reachOffsetNoLine' ::
-- | Newline token and tab token
(Token s, Token s) ->
-- | Offset to reach
-- | Increment in column position for a token
(Token s -> Pos) ->
Int ->
-- | Initial 'PosState' to use
PosState s ->
Expand All @@ -648,6 +657,7 @@ reachOffsetNoLine'
splitAt'
foldl''
(newlineTok, tabTok)
columnIncrement
o
PosState {..} =
( PosState
Expand All @@ -670,7 +680,7 @@ reachOffsetNoLine'
| ch == tabTok ->
SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))
| otherwise ->
SourcePos n l (c <> pos1)
SourcePos n l (c <> columnIncrement ch)
{-# INLINE reachOffsetNoLine' #-}

-- | Like 'BL.splitAt' but accepts the index as an 'Int'.
Expand Down Expand Up @@ -753,3 +763,13 @@ expandTab w' = go 0 0
go !i 0 (x : xs) = x : go (i + 1) 0 xs
go !i n xs = ' ' : go (i + 1) (n - 1) xs
w = unPos w'

-- | Return increment in column position that corresponds to the given
-- 'Char'.
charInc :: Char -> Pos
charInc ch = if Unicode.isWideChar ch then pos1 <> pos1 else pos1

-- | Return increment in column position that corresponds to the given
-- 'Word8'.
byteInc :: Word8 -> Pos
byteInc _ = pos1
180 changes: 180 additions & 0 deletions Text/Megaparsec/Unicode.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
{-# LANGUAGE Safe #-}

-- |
-- Module : Text.Megaparsec.Unicode
-- Copyright : © 2024–present Megaparsec contributors
-- License : FreeBSD
--
-- Maintainer : Mark Karpov <[email protected]>
-- Stability : experimental
-- Portability : portable
--
-- Utility functions for working with Unicode.
--
-- @since 9.7.0
module Text.Megaparsec.Unicode
( stringLength,
charLength,
isWideChar,
)
where

import Data.Array (Array, bounds, listArray, (!))
import Data.Char (ord)

-- | Calculate length of a string taking into account the fact that certain
-- 'Char's may span more than 1 column.
--
-- @since 9.7.0
stringLength :: (Traversable t) => t Char -> Int
stringLength = sum . fmap charLength

-- | Return length of an individual 'Char'.
--
-- @since 9.7.0
charLength :: Char -> Int
charLength ch = if isWideChar ch then 2 else 1

-- | Determine whether the given 'Char' is “wide”, that is, whether it spans
-- 2 columns instead of one.
--
-- @since 9.7.0
isWideChar :: Char -> Bool
isWideChar c = go (bounds wideCharRanges)
where
go (lo, hi)
| hi < lo = False
| a <= n && n <= b = True
| n < a = go (lo, pred mid)
| otherwise = go (succ mid, hi)
where
mid = (lo + hi) `div` 2
(a, b) = wideCharRanges ! mid
n = ord c

-- | Wide character ranges.
wideCharRanges :: Array Int (Int, Int)
wideCharRanges =
listArray
(0, 118)
[ (0x001100, 0x00115f),
(0x00231a, 0x00231b),
(0x002329, 0x00232a),
(0x0023e9, 0x0023ec),
(0x0023f0, 0x0023f0),
(0x0023f3, 0x0023f3),
(0x0025fd, 0x0025fe),
(0x002614, 0x002615),
(0x002648, 0x002653),
(0x00267f, 0x00267f),
(0x002693, 0x002693),
(0x0026a1, 0x0026a1),
(0x0026aa, 0x0026ab),
(0x0026bd, 0x0026be),
(0x0026c4, 0x0026c5),
(0x0026ce, 0x0026ce),
(0x0026d4, 0x0026d4),
(0x0026ea, 0x0026ea),
(0x0026f2, 0x0026f3),
(0x0026f5, 0x0026f5),
(0x0026fa, 0x0026fa),
(0x0026fd, 0x0026fd),
(0x002705, 0x002705),
(0x00270a, 0x00270b),
(0x002728, 0x002728),
(0x00274c, 0x00274c),
(0x00274e, 0x00274e),
(0x002753, 0x002755),
(0x002757, 0x002757),
(0x002795, 0x002797),
(0x0027b0, 0x0027b0),
(0x0027bf, 0x0027bf),
(0x002b1b, 0x002b1c),
(0x002b50, 0x002b50),
(0x002b55, 0x002b55),
(0x002e80, 0x002e99),
(0x002e9b, 0x002ef3),
(0x002f00, 0x002fd5),
(0x002ff0, 0x002ffb),
(0x003000, 0x00303e),
(0x003041, 0x003096),
(0x003099, 0x0030ff),
(0x003105, 0x00312f),
(0x003131, 0x00318e),
(0x003190, 0x0031ba),
(0x0031c0, 0x0031e3),
(0x0031f0, 0x00321e),
(0x003220, 0x003247),
(0x003250, 0x004db5),
(0x004e00, 0x009fef),
(0x00a000, 0x00a48c),
(0x00a490, 0x00a4c6),
(0x00a960, 0x00a97c),
(0x00ac00, 0x00d7a3),
(0x00f900, 0x00fa6d),
(0x00fa70, 0x00fad9),
(0x00fe10, 0x00fe19),
(0x00fe30, 0x00fe52),
(0x00fe54, 0x00fe66),
(0x00fe68, 0x00fe6b),
(0x00ff01, 0x00ff60),
(0x00ffe0, 0x00ffe6),
(0x016fe0, 0x016fe3),
(0x017000, 0x0187f7),
(0x018800, 0x018af2),
(0x01b000, 0x01b11e),
(0x01b150, 0x01b152),
(0x01b164, 0x01b167),
(0x01b170, 0x01b2fb),
(0x01f004, 0x01f004),
(0x01f0cf, 0x01f0cf),
(0x01f18e, 0x01f18e),
(0x01f191, 0x01f19a),
(0x01f200, 0x01f202),
(0x01f210, 0x01f23b),
(0x01f240, 0x01f248),
(0x01f250, 0x01f251),
(0x01f260, 0x01f265),
(0x01f300, 0x01f320),
(0x01f32d, 0x01f335),
(0x01f337, 0x01f37c),
(0x01f37e, 0x01f393),
(0x01f3a0, 0x01f3ca),
(0x01f3cf, 0x01f3d3),
(0x01f3e0, 0x01f3f0),
(0x01f3f4, 0x01f3f4),
(0x01f3f8, 0x01f43e),
(0x01f440, 0x01f440),
(0x01f442, 0x01f4fc),
(0x01f4ff, 0x01f53d),
(0x01f54b, 0x01f54e),
(0x01f550, 0x01f567),
(0x01f57a, 0x01f57a),
(0x01f595, 0x01f596),
(0x01f5a4, 0x01f5a4),
(0x01f5fb, 0x01f64f),
(0x01f680, 0x01f6c5),
(0x01f6cc, 0x01f6cc),
(0x01f6d0, 0x01f6d2),
(0x01f6d5, 0x01f6d5),
(0x01f6eb, 0x01f6ec),
(0x01f6f4, 0x01f6fa),
(0x01f7e0, 0x01f7eb),
(0x01f90d, 0x01f971),
(0x01f973, 0x01f976),
(0x01f97a, 0x01f9a2),
(0x01f9a5, 0x01f9aa),
(0x01f9ae, 0x01f9ca),
(0x01f9cd, 0x01f9ff),
(0x01fa70, 0x01fa73),
(0x01fa78, 0x01fa7a),
(0x01fa80, 0x01fa82),
(0x01fa90, 0x01fa95),
(0x020000, 0x02a6d6),
(0x02a700, 0x02b734),
(0x02b740, 0x02b81d),
(0x02b820, 0x02cea1),
(0x02ceb0, 0x02ebe0),
(0x02f800, 0x02fa1d)
]
{-# NOINLINE wideCharRanges #-}
1 change: 1 addition & 0 deletions megaparsec-tests/megaparsec-tests.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ test-suite tests
Text.Megaparsec.ErrorSpec
Text.Megaparsec.PosSpec
Text.Megaparsec.StreamSpec
Text.Megaparsec.UnicodeSpec
Text.MegaparsecSpec

default-language: Haskell2010
Expand Down
Loading

0 comments on commit 1160ac8

Please sign in to comment.