never executed always true always false
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE BangPatterns #-}
10 {-# LANGUAGE CPP #-}
11 {-# LANGUAGE MagicHash #-}
12 {-# LANGUAGE UnboxedTuples #-}
13
14 {-# OPTIONS_GHC -O2 #-}
15 -- We always optimise this, otherwise performance of a non-optimised
16 -- compiler is severely affected
17
18 module GHC.Data.StringBuffer
19 (
20 StringBuffer(..),
21 -- non-abstract for vs\/HaskellService
22
23 -- * Creation\/destruction
24 hGetStringBuffer,
25 hGetStringBufferBlock,
26 hPutStringBuffer,
27 appendStringBuffers,
28 stringToStringBuffer,
29
30 -- * Inspection
31 nextChar,
32 currentChar,
33 prevChar,
34 atEnd,
35 fingerprintStringBuffer,
36
37 -- * Moving and comparison
38 stepOn,
39 offsetBytes,
40 byteDiff,
41 atLine,
42
43 -- * Conversion
44 lexemeToString,
45 lexemeToFastString,
46 decodePrevNChars,
47
48 -- * Parsing integers
49 parseUnsignedInteger,
50
51 -- * Checking for bi-directional format characters
52 containsBidirectionalFormatChar,
53 bidirectionalFormatChars
54 ) where
55
56 import GHC.Prelude
57
58 import GHC.Data.FastString
59 import GHC.Utils.Encoding
60 import GHC.Utils.IO.Unsafe
61 import GHC.Utils.Panic.Plain
62 import GHC.Utils.Exception ( bracket_ )
63 import GHC.Fingerprint
64
65 import Data.Maybe
66 import System.IO
67 import System.IO.Unsafe ( unsafePerformIO )
68 import GHC.IO.Encoding.UTF8 ( mkUTF8 )
69 import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) )
70
71 import GHC.Exts
72
73 import Foreign
74 #if MIN_VERSION_base(4,15,0)
75 import GHC.ForeignPtr (unsafeWithForeignPtr)
76 #else
77 unsafeWithForeignPtr :: ForeignPtr a -> (Ptr a -> IO b) -> IO b
78 unsafeWithForeignPtr = withForeignPtr
79 #endif
80
81 -- -----------------------------------------------------------------------------
82 -- The StringBuffer type
83
84 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
85 -- The bytes are intended to be *immutable*. There are pure
86 -- operations to read the contents of a StringBuffer.
87 --
88 -- A StringBuffer may have a finalizer, depending on how it was
89 -- obtained.
90 --
91 data StringBuffer
92 = StringBuffer {
93 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
94 len :: {-# UNPACK #-} !Int, -- length
95 cur :: {-# UNPACK #-} !Int -- current pos
96 }
97 -- The buffer is assumed to be UTF-8 encoded, and furthermore
98 -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
99 -- decoder doesn't have to check for overflow at every single byte
100 -- of a multibyte sequence.
101
102 instance Show StringBuffer where
103 showsPrec _ s = showString "<stringbuffer("
104 . shows (len s) . showString "," . shows (cur s)
105 . showString ")>"
106
107 -- -----------------------------------------------------------------------------
108 -- Creation / Destruction
109
110 -- | Read a file into a 'StringBuffer'. The resulting buffer is automatically
111 -- managed by the garbage collector.
112 hGetStringBuffer :: FilePath -> IO StringBuffer
113 hGetStringBuffer fname = do
114 h <- openBinaryFile fname ReadMode
115 size_i <- hFileSize h
116 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
117 let size = fromIntegral $ size_i - offset_i
118 buf <- mallocForeignPtrArray (size+3)
119 unsafeWithForeignPtr buf $ \ptr -> do
120 r <- if size == 0 then return 0 else hGetBuf h ptr size
121 hClose h
122 if (r /= size)
123 then ioError (userError "short read of file")
124 else newUTF8StringBuffer buf ptr size
125
126 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
127 hGetStringBufferBlock handle wanted
128 = do size_i <- hFileSize handle
129 offset_i <- hTell handle >>= skipBOM handle size_i
130 let size = min wanted (fromIntegral $ size_i-offset_i)
131 buf <- mallocForeignPtrArray (size+3)
132 unsafeWithForeignPtr buf $ \ptr ->
133 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
134 if r /= size
135 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
136 else newUTF8StringBuffer buf ptr size
137
138 hPutStringBuffer :: Handle -> StringBuffer -> IO ()
139 hPutStringBuffer hdl (StringBuffer buf len cur)
140 = unsafeWithForeignPtr (plusForeignPtr buf cur) $ \ptr ->
141 hPutBuf hdl ptr len
142
143 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
144 -- and return the new position of the handle in bytes.
145 --
146 -- This is better than treating #FEFF as whitespace,
147 -- because that would mess up layout. We don't have a concept
148 -- of zero-width whitespace in Haskell: all whitespace codepoints
149 -- have a width of one column.
150 skipBOM :: Handle -> Integer -> Integer -> IO Integer
151 skipBOM h size offset =
152 -- Only skip BOM at the beginning of a file.
153 if size > 0 && offset == 0
154 then do
155 -- Validate assumption that handle is in binary mode.
156 assertM (hGetEncoding h >>= return . isNothing)
157 -- Temporarily select utf8 encoding with error ignoring,
158 -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
159 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
160 c <- hLookAhead h
161 if c == '\xfeff'
162 then hGetChar h >> hTell h
163 else return offset
164 else return offset
165 where
166 safeEncoding = mkUTF8 IgnoreCodingFailure
167
168 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
169 newUTF8StringBuffer buf ptr size = do
170 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
171 -- sentinels for UTF-8 decoding
172 return $ StringBuffer buf size 0
173
174 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
175 appendStringBuffers sb1 sb2
176 = do newBuf <- mallocForeignPtrArray (size+3)
177 unsafeWithForeignPtr newBuf $ \ptr ->
178 unsafeWithForeignPtr (buf sb1) $ \sb1Ptr ->
179 unsafeWithForeignPtr (buf sb2) $ \sb2Ptr ->
180 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
181 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
182 pokeArray (ptr `advancePtr` size) [0,0,0]
183 return (StringBuffer newBuf size 0)
184 where sb1_len = calcLen sb1
185 sb2_len = calcLen sb2
186 calcLen sb = len sb - cur sb
187 size = sb1_len + sb2_len
188
189 -- | Encode a 'String' into a 'StringBuffer' as UTF-8. The resulting buffer
190 -- is automatically managed by the garbage collector.
191 stringToStringBuffer :: String -> StringBuffer
192 stringToStringBuffer str =
193 unsafePerformIO $ do
194 let size = utf8EncodedLength str
195 buf <- mallocForeignPtrArray (size+3)
196 unsafeWithForeignPtr buf $ \ptr -> do
197 utf8EncodeStringPtr ptr str
198 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
199 -- sentinels for UTF-8 decoding
200 return (StringBuffer buf size 0)
201
202 -- -----------------------------------------------------------------------------
203 -- Grab a character
204
205 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
206 -- the remaining portion (analogous to 'Data.List.uncons'). __Warning:__ The
207 -- behavior is undefined if the 'StringBuffer' is empty. The result shares
208 -- the same buffer as the original. Similar to 'utf8DecodeChar', if the
209 -- character cannot be decoded as UTF-8, @\'\\0\'@ is returned.
210 {-# INLINE nextChar #-}
211 nextChar :: StringBuffer -> (Char,StringBuffer)
212 nextChar (StringBuffer buf len (I# cur#)) =
213 -- Getting our fingers dirty a little here, but this is performance-critical
214 inlinePerformIO $
215 unsafeWithForeignPtr buf $ \(Ptr a#) ->
216 case utf8DecodeCharAddr# (a# `plusAddr#` cur#) 0# of
217 (# c#, nBytes# #) ->
218 let cur' = I# (cur# +# nBytes#) in
219 return (C# c#, StringBuffer buf len cur')
220
221
222 bidirectionalFormatChars :: [(Char,String)]
223 bidirectionalFormatChars =
224 [ ('\x202a' , "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)")
225 , ('\x202b' , "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)")
226 , ('\x202c' , "U+202C POP DIRECTIONAL FORMATTING (PDF)")
227 , ('\x202d' , "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)")
228 , ('\x202e' , "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)")
229 , ('\x2066' , "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)")
230 , ('\x2067' , "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)")
231 , ('\x2068' , "U+2068 FIRST STRONG ISOLATE (FSI)")
232 , ('\x2069' , "U+2069 POP DIRECTIONAL ISOLATE (PDI)")
233 ]
234
235 {-| Returns true if the buffer contains Unicode bi-directional formatting
236 characters.
237
238 https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types
239
240 Bidirectional format characters are one of
241 '\x202a' : "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)"
242 '\x202b' : "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)"
243 '\x202c' : "U+202C POP DIRECTIONAL FORMATTING (PDF)"
244 '\x202d' : "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)"
245 '\x202e' : "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)"
246 '\x2066' : "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)"
247 '\x2067' : "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)"
248 '\x2068' : "U+2068 FIRST STRONG ISOLATE (FSI)"
249 '\x2069' : "U+2069 POP DIRECTIONAL ISOLATE (PDI)"
250
251 This list is encoded in 'bidirectionalFormatChars'
252
253 -}
254 {-# INLINE containsBidirectionalFormatChar #-}
255 containsBidirectionalFormatChar :: StringBuffer -> Bool
256 containsBidirectionalFormatChar (StringBuffer buf (I# len#) (I# cur#))
257 = inlinePerformIO $ unsafeWithForeignPtr buf $ \(Ptr a#) -> do
258 let go :: Int# -> Bool
259 go i | isTrue# (i >=# len#) = False
260 | otherwise = case utf8DecodeCharAddr# a# i of
261 (# '\x202a'# , _ #) -> True
262 (# '\x202b'# , _ #) -> True
263 (# '\x202c'# , _ #) -> True
264 (# '\x202d'# , _ #) -> True
265 (# '\x202e'# , _ #) -> True
266 (# '\x2066'# , _ #) -> True
267 (# '\x2067'# , _ #) -> True
268 (# '\x2068'# , _ #) -> True
269 (# '\x2069'# , _ #) -> True
270 (# _, bytes #) -> go (i +# bytes)
271 pure $! go cur#
272
273 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
274 -- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
275 -- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character
276 -- cannot be decoded as UTF-8, @\'\\0\'@ is returned.
277 currentChar :: StringBuffer -> Char
278 currentChar = fst . nextChar
279
280 prevChar :: StringBuffer -> Char -> Char
281 prevChar (StringBuffer _ _ 0) deflt = deflt
282 prevChar (StringBuffer buf _ cur) _ =
283 inlinePerformIO $
284 unsafeWithForeignPtr buf $ \p -> do
285 p' <- utf8PrevChar (p `plusPtr` cur)
286 return (fst (utf8DecodeChar p'))
287
288 -- -----------------------------------------------------------------------------
289 -- Moving
290
291 -- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
292 -- to 'Data.List.tail'). __Warning:__ The behavior is undefined if the
293 -- 'StringBuffer' is empty. The result shares the same buffer as the
294 -- original.
295 stepOn :: StringBuffer -> StringBuffer
296 stepOn s = snd (nextChar s)
297
298 -- | Return a 'StringBuffer' with the first @n@ bytes removed. __Warning:__
299 -- If there aren't enough characters, the returned 'StringBuffer' will be
300 -- invalid and any use of it may lead to undefined behavior. The result
301 -- shares the same buffer as the original.
302 offsetBytes :: Int -- ^ @n@, the number of bytes
303 -> StringBuffer
304 -> StringBuffer
305 offsetBytes i s = s { cur = cur s + i }
306
307 -- | Compute the difference in offset between two 'StringBuffer's that share
308 -- the same buffer. __Warning:__ The behavior is undefined if the
309 -- 'StringBuffer's use separate buffers.
310 byteDiff :: StringBuffer -> StringBuffer -> Int
311 byteDiff s1 s2 = cur s2 - cur s1
312
313 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
314 atEnd :: StringBuffer -> Bool
315 atEnd (StringBuffer _ l c) = l == c
316
317 -- | Computes a hash of the contents of a 'StringBuffer'.
318 fingerprintStringBuffer :: StringBuffer -> Fingerprint
319 fingerprintStringBuffer (StringBuffer buf len cur) =
320 unsafePerformIO $
321 withForeignPtr buf $ \ptr ->
322 fingerprintData (ptr `plusPtr` cur) len
323
324 -- | Computes a 'StringBuffer' which points to the first character of the
325 -- wanted line. Lines begin at 1.
326 atLine :: Int -> StringBuffer -> Maybe StringBuffer
327 atLine line sb@(StringBuffer buf len _) =
328 inlinePerformIO $
329 unsafeWithForeignPtr buf $ \p -> do
330 p' <- skipToLine line len p
331 if p' == nullPtr
332 then return Nothing
333 else
334 let
335 delta = p' `minusPtr` p
336 in return $ Just (sb { cur = delta
337 , len = len - delta
338 })
339
340 skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
341 skipToLine !line !len !op0 = go 1 op0
342 where
343 !opend = op0 `plusPtr` len
344
345 go !i_line !op
346 | op >= opend = pure nullPtr
347 | i_line == line = pure op
348 | otherwise = do
349 w <- peek op :: IO Word8
350 case w of
351 10 -> go (i_line + 1) (plusPtr op 1)
352 13 -> do
353 -- this is safe because a 'StringBuffer' is
354 -- guaranteed to have 3 bytes sentinel values.
355 w' <- peek (plusPtr op 1) :: IO Word8
356 case w' of
357 10 -> go (i_line + 1) (plusPtr op 2)
358 _ -> go (i_line + 1) (plusPtr op 1)
359 _ -> go i_line (plusPtr op 1)
360
361 -- -----------------------------------------------------------------------------
362 -- Conversion
363
364 -- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
365 -- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
366 -- they will be replaced with @\'\\0\'@.
367 lexemeToString :: StringBuffer
368 -> Int -- ^ @n@, the number of bytes
369 -> String
370 lexemeToString _ 0 = ""
371 lexemeToString (StringBuffer buf _ cur) bytes =
372 utf8DecodeStringLazy buf cur bytes
373
374 lexemeToFastString :: StringBuffer
375 -> Int -- ^ @n@, the number of bytes
376 -> FastString
377 lexemeToFastString _ 0 = nilFS
378 lexemeToFastString (StringBuffer buf _ cur) len =
379 inlinePerformIO $
380 unsafeWithForeignPtr buf $ \ptr ->
381 return $! mkFastStringBytes (ptr `plusPtr` cur) len
382
383 -- | Return the previous @n@ characters (or fewer if we are less than @n@
384 -- characters into the buffer.
385 decodePrevNChars :: Int -> StringBuffer -> String
386 decodePrevNChars n (StringBuffer buf _ cur) =
387 inlinePerformIO $ unsafeWithForeignPtr buf $ \p0 ->
388 go p0 n "" (p0 `plusPtr` (cur - 1))
389 where
390 go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
391 go buf0 n acc p | n == 0 || buf0 >= p = return acc
392 go buf0 n acc p = do
393 p' <- utf8PrevChar p
394 let (c,_) = utf8DecodeChar p'
395 go buf0 (n - 1) (c:acc) p'
396
397 -- -----------------------------------------------------------------------------
398 -- Parsing integer strings in various bases
399 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
400 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
401 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
402 go i x | i == len = x
403 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
404 '_' -> go (i + 1) x -- skip "_" (#14473)
405 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
406 in go 0 0