never executed always true always false

    1 {-
    2 (c) The University of Glasgow 2006
    3 (c) The University of Glasgow, 1997-2006
    4 
    5 
    6 Buffers for scanning string input stored in external arrays.
    7 -}
    8 
    9 {-# LANGUAGE BangPatterns #-}
   10 {-# LANGUAGE CPP #-}
   11 {-# LANGUAGE MagicHash #-}
   12 {-# LANGUAGE UnboxedTuples #-}
   13 
   14 {-# OPTIONS_GHC -O2 #-}
   15 -- We always optimise this, otherwise performance of a non-optimised
   16 -- compiler is severely affected
   17 
   18 module GHC.Data.StringBuffer
   19        (
   20         StringBuffer(..),
   21         -- non-abstract for vs\/HaskellService
   22 
   23          -- * Creation\/destruction
   24         hGetStringBuffer,
   25         hGetStringBufferBlock,
   26         hPutStringBuffer,
   27         appendStringBuffers,
   28         stringToStringBuffer,
   29 
   30         -- * Inspection
   31         nextChar,
   32         currentChar,
   33         prevChar,
   34         atEnd,
   35         fingerprintStringBuffer,
   36 
   37         -- * Moving and comparison
   38         stepOn,
   39         offsetBytes,
   40         byteDiff,
   41         atLine,
   42 
   43         -- * Conversion
   44         lexemeToString,
   45         lexemeToFastString,
   46         decodePrevNChars,
   47 
   48          -- * Parsing integers
   49         parseUnsignedInteger,
   50 
   51         -- * Checking for bi-directional format characters
   52         containsBidirectionalFormatChar,
   53         bidirectionalFormatChars
   54         ) where
   55 
   56 import GHC.Prelude
   57 
   58 import GHC.Data.FastString
   59 import GHC.Utils.Encoding
   60 import GHC.Utils.IO.Unsafe
   61 import GHC.Utils.Panic.Plain
   62 import GHC.Utils.Exception      ( bracket_ )
   63 import GHC.Fingerprint
   64 
   65 import Data.Maybe
   66 import System.IO
   67 import System.IO.Unsafe         ( unsafePerformIO )
   68 import GHC.IO.Encoding.UTF8     ( mkUTF8 )
   69 import GHC.IO.Encoding.Failure  ( CodingFailureMode(IgnoreCodingFailure) )
   70 
   71 import GHC.Exts
   72 
   73 import Foreign
   74 #if MIN_VERSION_base(4,15,0)
   75 import GHC.ForeignPtr (unsafeWithForeignPtr)
   76 #else
   77 unsafeWithForeignPtr :: ForeignPtr a -> (Ptr a -> IO b) -> IO b
   78 unsafeWithForeignPtr = withForeignPtr
   79 #endif
   80 
   81 -- -----------------------------------------------------------------------------
   82 -- The StringBuffer type
   83 
   84 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
   85 -- The bytes are intended to be *immutable*.  There are pure
   86 -- operations to read the contents of a StringBuffer.
   87 --
   88 -- A StringBuffer may have a finalizer, depending on how it was
   89 -- obtained.
   90 --
   91 data StringBuffer
   92  = StringBuffer {
   93      buf :: {-# UNPACK #-} !(ForeignPtr Word8),
   94      len :: {-# UNPACK #-} !Int,        -- length
   95      cur :: {-# UNPACK #-} !Int         -- current pos
   96   }
   97   -- The buffer is assumed to be UTF-8 encoded, and furthermore
   98   -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
   99   -- decoder doesn't have to check for overflow at every single byte
  100   -- of a multibyte sequence.
  101 
  102 instance Show StringBuffer where
  103         showsPrec _ s = showString "<stringbuffer("
  104                       . shows (len s) . showString "," . shows (cur s)
  105                       . showString ")>"
  106 
  107 -- -----------------------------------------------------------------------------
  108 -- Creation / Destruction
  109 
  110 -- | Read a file into a 'StringBuffer'.  The resulting buffer is automatically
  111 -- managed by the garbage collector.
  112 hGetStringBuffer :: FilePath -> IO StringBuffer
  113 hGetStringBuffer fname = do
  114    h <- openBinaryFile fname ReadMode
  115    size_i <- hFileSize h
  116    offset_i <- skipBOM h size_i 0  -- offset is 0 initially
  117    let size = fromIntegral $ size_i - offset_i
  118    buf <- mallocForeignPtrArray (size+3)
  119    unsafeWithForeignPtr buf $ \ptr -> do
  120      r <- if size == 0 then return 0 else hGetBuf h ptr size
  121      hClose h
  122      if (r /= size)
  123         then ioError (userError "short read of file")
  124         else newUTF8StringBuffer buf ptr size
  125 
  126 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
  127 hGetStringBufferBlock handle wanted
  128     = do size_i <- hFileSize handle
  129          offset_i <- hTell handle >>= skipBOM handle size_i
  130          let size = min wanted (fromIntegral $ size_i-offset_i)
  131          buf <- mallocForeignPtrArray (size+3)
  132          unsafeWithForeignPtr buf $ \ptr ->
  133              do r <- if size == 0 then return 0 else hGetBuf handle ptr size
  134                 if r /= size
  135                    then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
  136                    else newUTF8StringBuffer buf ptr size
  137 
  138 hPutStringBuffer :: Handle -> StringBuffer -> IO ()
  139 hPutStringBuffer hdl (StringBuffer buf len cur)
  140     = unsafeWithForeignPtr (plusForeignPtr buf cur) $ \ptr ->
  141           hPutBuf hdl ptr len
  142 
  143 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
  144 -- and return the new position of the handle in bytes.
  145 --
  146 -- This is better than treating #FEFF as whitespace,
  147 -- because that would mess up layout.  We don't have a concept
  148 -- of zero-width whitespace in Haskell: all whitespace codepoints
  149 -- have a width of one column.
  150 skipBOM :: Handle -> Integer -> Integer -> IO Integer
  151 skipBOM h size offset =
  152   -- Only skip BOM at the beginning of a file.
  153   if size > 0 && offset == 0
  154     then do
  155       -- Validate assumption that handle is in binary mode.
  156       assertM (hGetEncoding h >>= return . isNothing)
  157       -- Temporarily select utf8 encoding with error ignoring,
  158       -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
  159       bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
  160         c <- hLookAhead h
  161         if c == '\xfeff'
  162           then hGetChar h >> hTell h
  163           else return offset
  164     else return offset
  165   where
  166     safeEncoding = mkUTF8 IgnoreCodingFailure
  167 
  168 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
  169 newUTF8StringBuffer buf ptr size = do
  170   pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
  171   -- sentinels for UTF-8 decoding
  172   return $ StringBuffer buf size 0
  173 
  174 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
  175 appendStringBuffers sb1 sb2
  176     = do newBuf <- mallocForeignPtrArray (size+3)
  177          unsafeWithForeignPtr newBuf $ \ptr ->
  178           unsafeWithForeignPtr (buf sb1) $ \sb1Ptr ->
  179            unsafeWithForeignPtr (buf sb2) $ \sb2Ptr ->
  180              do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
  181                 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
  182                 pokeArray (ptr `advancePtr` size) [0,0,0]
  183                 return (StringBuffer newBuf size 0)
  184     where sb1_len = calcLen sb1
  185           sb2_len = calcLen sb2
  186           calcLen sb = len sb - cur sb
  187           size =  sb1_len + sb2_len
  188 
  189 -- | Encode a 'String' into a 'StringBuffer' as UTF-8.  The resulting buffer
  190 -- is automatically managed by the garbage collector.
  191 stringToStringBuffer :: String -> StringBuffer
  192 stringToStringBuffer str =
  193  unsafePerformIO $ do
  194   let size = utf8EncodedLength str
  195   buf <- mallocForeignPtrArray (size+3)
  196   unsafeWithForeignPtr buf $ \ptr -> do
  197     utf8EncodeStringPtr ptr str
  198     pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
  199     -- sentinels for UTF-8 decoding
  200   return (StringBuffer buf size 0)
  201 
  202 -- -----------------------------------------------------------------------------
  203 -- Grab a character
  204 
  205 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
  206 -- the remaining portion (analogous to 'Data.List.uncons').  __Warning:__ The
  207 -- behavior is undefined if the 'StringBuffer' is empty.  The result shares
  208 -- the same buffer as the original.  Similar to 'utf8DecodeChar', if the
  209 -- character cannot be decoded as UTF-8, @\'\\0\'@ is returned.
  210 {-# INLINE nextChar #-}
  211 nextChar :: StringBuffer -> (Char,StringBuffer)
  212 nextChar (StringBuffer buf len (I# cur#)) =
  213   -- Getting our fingers dirty a little here, but this is performance-critical
  214   inlinePerformIO $
  215     unsafeWithForeignPtr buf $ \(Ptr a#) ->
  216         case utf8DecodeCharAddr# (a# `plusAddr#` cur#) 0# of
  217           (# c#, nBytes# #) ->
  218              let cur' = I# (cur# +# nBytes#) in
  219              return (C# c#, StringBuffer buf len cur')
  220 
  221 
  222 bidirectionalFormatChars :: [(Char,String)]
  223 bidirectionalFormatChars =
  224   [ ('\x202a' , "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)")
  225   , ('\x202b' , "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)")
  226   , ('\x202c' , "U+202C POP DIRECTIONAL FORMATTING (PDF)")
  227   , ('\x202d' , "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)")
  228   , ('\x202e' , "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)")
  229   , ('\x2066' , "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)")
  230   , ('\x2067' , "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)")
  231   , ('\x2068' , "U+2068 FIRST STRONG ISOLATE (FSI)")
  232   , ('\x2069' , "U+2069 POP DIRECTIONAL ISOLATE (PDI)")
  233   ]
  234 
  235 {-| Returns true if the buffer contains Unicode bi-directional formatting
  236 characters.
  237 
  238 https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types
  239 
  240 Bidirectional format characters are one of
  241 '\x202a' : "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)"
  242 '\x202b' : "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)"
  243 '\x202c' : "U+202C POP DIRECTIONAL FORMATTING (PDF)"
  244 '\x202d' : "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)"
  245 '\x202e' : "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)"
  246 '\x2066' : "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)"
  247 '\x2067' : "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)"
  248 '\x2068' : "U+2068 FIRST STRONG ISOLATE (FSI)"
  249 '\x2069' : "U+2069 POP DIRECTIONAL ISOLATE (PDI)"
  250 
  251 This list is encoded in 'bidirectionalFormatChars'
  252 
  253 -}
  254 {-# INLINE containsBidirectionalFormatChar #-}
  255 containsBidirectionalFormatChar :: StringBuffer -> Bool
  256 containsBidirectionalFormatChar (StringBuffer buf (I# len#) (I# cur#))
  257   = inlinePerformIO $ unsafeWithForeignPtr buf $ \(Ptr a#) -> do
  258   let go :: Int# -> Bool
  259       go i | isTrue# (i >=# len#) = False
  260            | otherwise = case utf8DecodeCharAddr# a# i of
  261                 (# '\x202a'#  , _ #) -> True
  262                 (# '\x202b'#  , _ #) -> True
  263                 (# '\x202c'#  , _ #) -> True
  264                 (# '\x202d'#  , _ #) -> True
  265                 (# '\x202e'#  , _ #) -> True
  266                 (# '\x2066'#  , _ #) -> True
  267                 (# '\x2067'#  , _ #) -> True
  268                 (# '\x2068'#  , _ #) -> True
  269                 (# '\x2069'#  , _ #) -> True
  270                 (# _, bytes #) -> go (i +# bytes)
  271   pure $! go cur#
  272 
  273 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
  274 -- to 'Data.List.head').  __Warning:__ The behavior is undefined if the
  275 -- 'StringBuffer' is empty.  Similar to 'utf8DecodeChar', if the character
  276 -- cannot be decoded as UTF-8, @\'\\0\'@ is returned.
  277 currentChar :: StringBuffer -> Char
  278 currentChar = fst . nextChar
  279 
  280 prevChar :: StringBuffer -> Char -> Char
  281 prevChar (StringBuffer _   _   0)   deflt = deflt
  282 prevChar (StringBuffer buf _   cur) _     =
  283   inlinePerformIO $
  284     unsafeWithForeignPtr buf $ \p -> do
  285       p' <- utf8PrevChar (p `plusPtr` cur)
  286       return (fst (utf8DecodeChar p'))
  287 
  288 -- -----------------------------------------------------------------------------
  289 -- Moving
  290 
  291 -- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
  292 -- to 'Data.List.tail').  __Warning:__ The behavior is undefined if the
  293 -- 'StringBuffer' is empty.  The result shares the same buffer as the
  294 -- original.
  295 stepOn :: StringBuffer -> StringBuffer
  296 stepOn s = snd (nextChar s)
  297 
  298 -- | Return a 'StringBuffer' with the first @n@ bytes removed.  __Warning:__
  299 -- If there aren't enough characters, the returned 'StringBuffer' will be
  300 -- invalid and any use of it may lead to undefined behavior.  The result
  301 -- shares the same buffer as the original.
  302 offsetBytes :: Int                      -- ^ @n@, the number of bytes
  303             -> StringBuffer
  304             -> StringBuffer
  305 offsetBytes i s = s { cur = cur s + i }
  306 
  307 -- | Compute the difference in offset between two 'StringBuffer's that share
  308 -- the same buffer.  __Warning:__ The behavior is undefined if the
  309 -- 'StringBuffer's use separate buffers.
  310 byteDiff :: StringBuffer -> StringBuffer -> Int
  311 byteDiff s1 s2 = cur s2 - cur s1
  312 
  313 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
  314 atEnd :: StringBuffer -> Bool
  315 atEnd (StringBuffer _ l c) = l == c
  316 
  317 -- | Computes a hash of the contents of a 'StringBuffer'.
  318 fingerprintStringBuffer :: StringBuffer -> Fingerprint
  319 fingerprintStringBuffer (StringBuffer buf len cur) =
  320   unsafePerformIO $
  321     withForeignPtr buf $ \ptr ->
  322       fingerprintData (ptr `plusPtr` cur) len
  323 
  324 -- | Computes a 'StringBuffer' which points to the first character of the
  325 -- wanted line. Lines begin at 1.
  326 atLine :: Int -> StringBuffer -> Maybe StringBuffer
  327 atLine line sb@(StringBuffer buf len _) =
  328   inlinePerformIO $
  329     unsafeWithForeignPtr buf $ \p -> do
  330       p' <- skipToLine line len p
  331       if p' == nullPtr
  332         then return Nothing
  333         else
  334           let
  335             delta = p' `minusPtr` p
  336           in return $ Just (sb { cur = delta
  337                                , len = len - delta
  338                                })
  339 
  340 skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
  341 skipToLine !line !len !op0 = go 1 op0
  342   where
  343     !opend = op0 `plusPtr` len
  344 
  345     go !i_line !op
  346       | op >= opend    = pure nullPtr
  347       | i_line == line = pure op
  348       | otherwise      = do
  349           w <- peek op :: IO Word8
  350           case w of
  351             10 -> go (i_line + 1) (plusPtr op 1)
  352             13 -> do
  353               -- this is safe because a 'StringBuffer' is
  354               -- guaranteed to have 3 bytes sentinel values.
  355               w' <- peek (plusPtr op 1) :: IO Word8
  356               case w' of
  357                 10 -> go (i_line + 1) (plusPtr op 2)
  358                 _  -> go (i_line + 1) (plusPtr op 1)
  359             _  -> go i_line (plusPtr op 1)
  360 
  361 -- -----------------------------------------------------------------------------
  362 -- Conversion
  363 
  364 -- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
  365 -- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
  366 -- they will be replaced with @\'\\0\'@.
  367 lexemeToString :: StringBuffer
  368                -> Int                   -- ^ @n@, the number of bytes
  369                -> String
  370 lexemeToString _ 0 = ""
  371 lexemeToString (StringBuffer buf _ cur) bytes =
  372   utf8DecodeStringLazy buf cur bytes
  373 
  374 lexemeToFastString :: StringBuffer
  375                    -> Int               -- ^ @n@, the number of bytes
  376                    -> FastString
  377 lexemeToFastString _ 0 = nilFS
  378 lexemeToFastString (StringBuffer buf _ cur) len =
  379    inlinePerformIO $
  380      unsafeWithForeignPtr buf $ \ptr ->
  381        return $! mkFastStringBytes (ptr `plusPtr` cur) len
  382 
  383 -- | Return the previous @n@ characters (or fewer if we are less than @n@
  384 -- characters into the buffer.
  385 decodePrevNChars :: Int -> StringBuffer -> String
  386 decodePrevNChars n (StringBuffer buf _ cur) =
  387     inlinePerformIO $ unsafeWithForeignPtr buf $ \p0 ->
  388       go p0 n "" (p0 `plusPtr` (cur - 1))
  389   where
  390     go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
  391     go buf0 n acc p | n == 0 || buf0 >= p = return acc
  392     go buf0 n acc p = do
  393         p' <- utf8PrevChar p
  394         let (c,_) = utf8DecodeChar p'
  395         go buf0 (n - 1) (c:acc) p'
  396 
  397 -- -----------------------------------------------------------------------------
  398 -- Parsing integer strings in various bases
  399 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
  400 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
  401   = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
  402     go i x | i == len  = x
  403            | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
  404                '_'  -> go (i + 1) x    -- skip "_" (#14473)
  405                char -> go (i + 1) (x * radix + toInteger (char_to_int char))
  406   in go 0 0