never executed always true always false
    1 {-# LANGUAGE BangPatterns, RecordWildCards, GADTs #-}
    2 module GHC.Cmm.LayoutStack (
    3        cmmLayoutStack, setInfoTableStackMap
    4   ) where
    5 
    6 import GHC.Prelude hiding ((<*>))
    7 
    8 import GHC.Platform
    9 import GHC.Platform.Profile
   10 
   11 import GHC.StgToCmm.Monad      ( newTemp  ) -- XXX layering violation
   12 import GHC.StgToCmm.Utils      ( callerSaveVolatileRegs  ) -- XXX layering violation
   13 import GHC.StgToCmm.Foreign    ( saveThreadState, loadThreadState ) -- XXX layering violation
   14 
   15 import GHC.Cmm
   16 import GHC.Cmm.Info
   17 import GHC.Cmm.BlockId
   18 import GHC.Cmm.Utils
   19 import GHC.Cmm.Graph
   20 import GHC.Cmm.Liveness
   21 import GHC.Cmm.ProcPoint
   22 import GHC.Runtime.Heap.Layout
   23 import GHC.Cmm.Dataflow.Block
   24 import GHC.Cmm.Dataflow.Collections
   25 import GHC.Cmm.Dataflow
   26 import GHC.Cmm.Dataflow.Graph
   27 import GHC.Cmm.Dataflow.Label
   28 import GHC.Types.Unique.Supply
   29 import GHC.Data.Maybe
   30 import GHC.Types.Unique.FM
   31 import GHC.Utils.Misc
   32 
   33 import GHC.Driver.Session
   34 import GHC.Utils.Outputable hiding ( isEmpty )
   35 import GHC.Utils.Panic
   36 import qualified Data.Set as Set
   37 import Control.Monad.Fix
   38 import Data.Array as Array
   39 import Data.List (nub)
   40 
   41 {- Note [Stack Layout]
   42 
   43 The job of this pass is to
   44 
   45  - replace references to abstract stack Areas with fixed offsets from Sp.
   46 
   47  - replace the CmmHighStackMark constant used in the stack check with
   48    the maximum stack usage of the proc.
   49 
   50  - save any variables that are live across a call, and reload them as
   51    necessary.
   52 
   53 Before stack allocation, local variables remain live across native
   54 calls (CmmCall{ cmm_cont = Just _ }), and after stack allocation local
   55 variables are clobbered by native calls.
   56 
   57 We want to do stack allocation so that as far as possible
   58  - stack use is minimized, and
   59  - unnecessary stack saves and loads are avoided.
   60 
   61 The algorithm we use is a variant of linear-scan register allocation,
   62 where the stack is our register file.
   63 
   64 We proceed in two passes, see Note [Two pass approach] for why they are not easy
   65 to merge into one.
   66 
   67 Pass 1:
   68 
   69  - First, we do a liveness analysis, which annotates every block with
   70    the variables live on entry to the block.
   71 
   72  - We traverse blocks in reverse postorder DFS; that is, we visit at
   73    least one predecessor of a block before the block itself.  The
   74    stack layout flowing from the predecessor of the block will
   75    determine the stack layout on entry to the block.
   76 
   77  - We maintain a data structure
   78 
   79      Map Label StackMap
   80 
   81    which describes the contents of the stack and the stack pointer on
   82    entry to each block that is a successor of a block that we have
   83    visited.
   84 
   85  - For each block we visit:
   86 
   87     - Look up the StackMap for this block.
   88 
   89     - If this block is a proc point (or a call continuation, if we aren't
   90       splitting proc points), we need to reload all the live variables from the
   91       stack - but this is done in Pass 2, which calculates more precise liveness
   92       information (see description of Pass 2).
   93 
   94     - Walk forwards through the instructions:
   95       - At an assignment  x = Sp[loc]
   96         - Record the fact that Sp[loc] contains x, so that we won't
   97           need to save x if it ever needs to be spilled.
   98       - At an assignment  x = E
   99         - If x was previously on the stack, it isn't any more
  100       - At the last node, if it is a call or a jump to a proc point
  101         - Lay out the stack frame for the call (see setupStackFrame)
  102         - emit instructions to save all the live variables
  103         - Remember the StackMaps for all the successors
  104         - emit an instruction to adjust Sp
  105       - If the last node is a branch, then the current StackMap is the
  106         StackMap for the successors.
  107 
  108     - Manifest Sp: replace references to stack areas in this block
  109       with real Sp offsets. We cannot do this until we have laid out
  110       the stack area for the successors above.
  111 
  112       In this phase we also eliminate redundant stores to the stack;
  113       see elimStackStores.
  114 
  115   - There is one important gotcha: sometimes we'll encounter a control
  116     transfer to a block that we've already processed (a join point),
  117     and in that case we might need to rearrange the stack to match
  118     what the block is expecting. (exactly the same as in linear-scan
  119     register allocation, except here we have the luxury of an infinite
  120     supply of temporary variables).
  121 
  122   - Finally, we update the magic CmmHighStackMark constant with the
  123     stack usage of the function, and eliminate the whole stack check
  124     if there was no stack use. (in fact this is done as part of the
  125     main traversal, by feeding the high-water-mark output back in as
  126     an input. I hate cyclic programming, but it's just too convenient
  127     sometimes.)
  128 
  129   There are plenty of tricky details: update frames, proc points, return
  130   addresses, foreign calls, and some ad-hoc optimisations that are
  131   convenient to do here and effective in common cases.  Comments in the
  132   code below explain these.
  133 
  134 Pass 2:
  135 
  136 - Calculate live registers, but taking into account that nothing is live at the
  137   entry to a proc point.
  138 
  139 - At each proc point and call continuation insert reloads of live registers from
  140   the stack (they were saved by Pass 1).
  141 
  142 
  143 Note [Two pass approach]
  144 
  145 The main reason for Pass 2 is being able to insert only the reloads that are
  146 needed and the fact that the two passes need different liveness information.
  147 Let's consider an example:
  148 
  149   .....
  150    \ /
  151     D   <- proc point
  152    / \
  153   E   F
  154    \ /
  155     G   <- proc point
  156     |
  157     X
  158 
  159 Pass 1 needs liveness assuming that local variables are preserved across calls.
  160 This is important because it needs to save any local registers to the stack
  161 (e.g., if register a is used in block X, it must be saved before any native
  162 call).
  163 However, for Pass 2, where we want to reload registers from stack (in a proc
  164 point), this is overly conservative and would lead us to generate reloads in D
  165 for things used in X, even though we're going to generate reloads in G anyway
  166 (since it's also a proc point).
  167 So Pass 2 calculates liveness knowing that nothing is live at the entry to a
  168 proc point. This means that in D we only need to reload things used in E or F.
  169 This can be quite important, for an extreme example see testcase for #3294.
  170 
  171 Merging the two passes is not trivial - Pass 2 is a backward rewrite and Pass 1
  172 is a forward one. Furthermore, Pass 1 is creating code that uses local registers
  173 (saving them before a call), which the liveness analysis for Pass 2 must see to
  174 be correct.
  175 
  176 -}
  177 
  178 
  179 -- All stack locations are expressed as positive byte offsets from the
  180 -- "base", which is defined to be the address above the return address
  181 -- on the stack on entry to this CmmProc.
  182 --
  183 -- Lower addresses have higher StackLocs.
  184 --
  185 type StackLoc = ByteOff
  186 
  187 {-
  188  A StackMap describes the stack at any given point.  At a continuation
  189  it has a particular layout, like this:
  190 
  191          |             | <- base
  192          |-------------|
  193          |     ret0    | <- base + 8
  194          |-------------|
  195          .  upd frame  . <- base + sm_ret_off
  196          |-------------|
  197          |             |
  198          .    vars     .
  199          . (live/dead) .
  200          |             | <- base + sm_sp - sm_args
  201          |-------------|
  202          |    ret1     |
  203          .  ret vals   . <- base + sm_sp    (<--- Sp points here)
  204          |-------------|
  205 
  206 Why do we include the final return address (ret0) in our stack map?  I
  207 have absolutely no idea, but it seems to be done that way consistently
  208 in the rest of the code generator, so I played along here. --SDM
  209 
  210 Note that we will be constructing an info table for the continuation
  211 (ret1), which needs to describe the stack down to, but not including,
  212 the update frame (or ret0, if there is no update frame).
  213 -}
  214 
  215 data StackMap = StackMap
  216  {  sm_sp   :: StackLoc
  217        -- ^ the offset of Sp relative to the base on entry
  218        -- to this block.
  219  ,  sm_args :: ByteOff
  220        -- ^ the number of bytes of arguments in the area for this block
  221        -- Defn: the offset of young(L) relative to the base is given by
  222        -- (sm_sp - sm_args) of the StackMap for block L.
  223  ,  sm_ret_off :: ByteOff
  224        -- ^ Number of words of stack that we do not describe with an info
  225        -- table, because it contains an update frame.
  226  ,  sm_regs :: UniqFM LocalReg (LocalReg,StackLoc)
  227        -- ^ regs on the stack
  228  }
  229 
  230 instance Outputable StackMap where
  231   ppr StackMap{..} =
  232      text "Sp = " <> int sm_sp $$
  233      text "sm_args = " <> int sm_args $$
  234      text "sm_ret_off = " <> int sm_ret_off $$
  235      text "sm_regs = " <> pprUFM sm_regs ppr
  236 
  237 
  238 cmmLayoutStack :: DynFlags -> ProcPointSet -> ByteOff -> CmmGraph
  239                -> UniqSM (CmmGraph, LabelMap StackMap)
  240 cmmLayoutStack dflags procpoints entry_args
  241                graph@(CmmGraph { g_entry = entry })
  242   = do
  243     -- We need liveness info. Dead assignments are removed later
  244     -- by the sinking pass.
  245     let liveness = cmmLocalLiveness platform graph
  246         blocks = revPostorder graph
  247         profile  = targetProfile dflags
  248         platform = profilePlatform profile
  249 
  250     (final_stackmaps, _final_high_sp, new_blocks) <-
  251           mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
  252             layout dflags procpoints liveness entry entry_args
  253                    rec_stackmaps rec_high_sp blocks
  254 
  255     blocks_with_reloads <-
  256         insertReloadsAsNeeded platform procpoints final_stackmaps entry new_blocks
  257     new_blocks' <- mapM (lowerSafeForeignCall profile) blocks_with_reloads
  258     return (ofBlockList entry new_blocks', final_stackmaps)
  259 
  260 -- -----------------------------------------------------------------------------
  261 -- Pass 1
  262 -- -----------------------------------------------------------------------------
  263 
  264 layout :: DynFlags
  265        -> LabelSet                      -- proc points
  266        -> LabelMap CmmLocalLive         -- liveness
  267        -> BlockId                       -- entry
  268        -> ByteOff                       -- stack args on entry
  269 
  270        -> LabelMap StackMap             -- [final] stack maps
  271        -> ByteOff                       -- [final] Sp high water mark
  272 
  273        -> [CmmBlock]                    -- [in] blocks
  274 
  275        -> UniqSM
  276           ( LabelMap StackMap           -- [out] stack maps
  277           , ByteOff                     -- [out] Sp high water mark
  278           , [CmmBlock]                  -- [out] new blocks
  279           )
  280 
  281 layout dflags procpoints liveness entry entry_args final_stackmaps final_sp_high blocks
  282   = go blocks init_stackmap entry_args []
  283   where
  284     (updfr, cont_info)  = collectContInfo blocks
  285 
  286     init_stackmap = mapSingleton entry StackMap{ sm_sp   = entry_args
  287                                                , sm_args = entry_args
  288                                                , sm_ret_off = updfr
  289                                                , sm_regs = emptyUFM
  290                                                }
  291 
  292     go [] acc_stackmaps acc_hwm acc_blocks
  293       = return (acc_stackmaps, acc_hwm, acc_blocks)
  294 
  295     go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
  296       = do
  297        let (entry0@(CmmEntry entry_lbl tscope), middle0, last0) = blockSplit b0
  298 
  299        let stack0@StackMap { sm_sp = sp0 }
  300                = mapFindWithDefault
  301                      (pprPanic "no stack map for" (ppr entry_lbl))
  302                      entry_lbl acc_stackmaps
  303 
  304        -- (a) Update the stack map to include the effects of
  305        --     assignments in this block
  306        let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0
  307 
  308        -- (b) Look at the last node and if we are making a call or
  309        --     jumping to a proc point, we must save the live
  310        --     variables, adjust Sp, and construct the StackMaps for
  311        --     each of the successor blocks.  See handleLastNode for
  312        --     details.
  313        (middle1, sp_off, last1, fixup_blocks, out)
  314            <- handleLastNode dflags procpoints liveness cont_info
  315                              acc_stackmaps stack1 tscope middle0 last0
  316 
  317        -- (c) Manifest Sp: run over the nodes in the block and replace
  318        --     CmmStackSlot with CmmLoad from Sp with a concrete offset.
  319        --
  320        -- our block:
  321        --    middle0          -- the original middle nodes
  322        --    middle1          -- live variable saves from handleLastNode
  323        --    Sp = Sp + sp_off -- Sp adjustment goes here
  324        --    last1            -- the last node
  325        --
  326        let middle_pre = blockToList $ foldl' blockSnoc middle0 middle1
  327 
  328        let final_blocks =
  329                manifestSp dflags final_stackmaps stack0 sp0 final_sp_high
  330                           entry0 middle_pre sp_off last1 fixup_blocks
  331 
  332        let acc_stackmaps' = mapUnion acc_stackmaps out
  333 
  334            -- If this block jumps to the GC, then we do not take its
  335            -- stack usage into account for the high-water mark.
  336            -- Otherwise, if the only stack usage is in the stack-check
  337            -- failure block itself, we will do a redundant stack
  338            -- check.  The stack has a buffer designed to accommodate
  339            -- the largest amount of stack needed for calling the GC.
  340            --
  341            this_sp_hwm | isGcJump last0 = 0
  342                        | otherwise      = sp0 - sp_off
  343 
  344            hwm' = maximum (acc_hwm : this_sp_hwm : map sm_sp (mapElems out))
  345 
  346        go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)
  347 
  348 
  349 -- -----------------------------------------------------------------------------
  350 
  351 -- Not foolproof, but GCFun is the culprit we most want to catch
  352 isGcJump :: CmmNode O C -> Bool
  353 isGcJump (CmmCall { cml_target = CmmReg (CmmGlobal l) })
  354   = l == GCFun || l == GCEnter1
  355 isGcJump _something_else = False
  356 
  357 -- -----------------------------------------------------------------------------
  358 
  359 -- This doesn't seem right somehow.  We need to find out whether this
  360 -- proc will push some update frame material at some point, so that we
  361 -- can avoid using that area of the stack for spilling. Ideally we would
  362 -- capture this information in the CmmProc (e.g. in CmmStackInfo; see #18232
  363 -- for details on one ill-fated attempt at this).
  364 --
  365 -- So we'll just take the max of all the cml_ret_offs.  This could be
  366 -- unnecessarily pessimistic, but probably not in the code we
  367 -- generate.
  368 
  369 collectContInfo :: [CmmBlock] -> (ByteOff, LabelMap ByteOff)
  370 collectContInfo blocks
  371   = (maximum ret_offs, mapFromList (catMaybes mb_argss))
  372  where
  373   (mb_argss, ret_offs) = mapAndUnzip get_cont blocks
  374 
  375   get_cont :: Block CmmNode x C -> (Maybe (Label, ByteOff), ByteOff)
  376   get_cont b =
  377      case lastNode b of
  378         CmmCall { cml_cont = Just l, .. }
  379            -> (Just (l, cml_ret_args), cml_ret_off)
  380         CmmForeignCall { .. }
  381            -> (Just (succ, ret_args), ret_off)
  382         _other -> (Nothing, 0)
  383 
  384 
  385 -- -----------------------------------------------------------------------------
  386 -- Updating the StackMap from middle nodes
  387 
  388 -- Look for loads from stack slots, and update the StackMap.  This is
  389 -- purely for optimisation reasons, so that we can avoid saving a
  390 -- variable back to a different stack slot if it is already on the
  391 -- stack.
  392 --
  393 -- This happens a lot: for example when function arguments are passed
  394 -- on the stack and need to be immediately saved across a call, we
  395 -- want to just leave them where they are on the stack.
  396 --
  397 procMiddle :: LabelMap StackMap -> CmmNode e x -> StackMap -> StackMap
  398 procMiddle stackmaps node sm
  399   = case node of
  400      CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)
  401        -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) }
  402         where loc = getStackLoc area off stackmaps
  403      CmmAssign (CmmLocal r) _other
  404        -> sm { sm_regs = delFromUFM (sm_regs sm) r }
  405      _other
  406        -> sm
  407 
  408 getStackLoc :: Area -> ByteOff -> LabelMap StackMap -> StackLoc
  409 getStackLoc Old       n _         = n
  410 getStackLoc (Young l) n stackmaps =
  411   case mapLookup l stackmaps of
  412     Nothing -> pprPanic "getStackLoc" (ppr l)
  413     Just sm -> sm_sp sm - sm_args sm + n
  414 
  415 
  416 -- -----------------------------------------------------------------------------
  417 -- Handling stack allocation for a last node
  418 
  419 -- We take a single last node and turn it into:
  420 --
  421 --    C1 (some statements)
  422 --    Sp = Sp + N
  423 --    C2 (some more statements)
  424 --    call f()          -- the actual last node
  425 --
  426 -- plus possibly some more blocks (we may have to add some fixup code
  427 -- between the last node and the continuation).
  428 --
  429 -- C1: is the code for saving the variables across this last node onto
  430 -- the stack, if the continuation is a call or jumps to a proc point.
  431 --
  432 -- C2: if the last node is a safe foreign call, we have to inject some
  433 -- extra code that goes *after* the Sp adjustment.
  434 
  435 handleLastNode
  436    :: DynFlags -> ProcPointSet -> LabelMap CmmLocalLive -> LabelMap ByteOff
  437    -> LabelMap StackMap -> StackMap -> CmmTickScope
  438    -> Block CmmNode O O
  439    -> CmmNode O C
  440    -> UniqSM
  441       ( [CmmNode O O]      -- nodes to go *before* the Sp adjustment
  442       , ByteOff            -- amount to adjust Sp
  443       , CmmNode O C        -- new last node
  444       , [CmmBlock]         -- new blocks
  445       , LabelMap StackMap  -- stackmaps for the continuations
  446       )
  447 
  448 handleLastNode dflags procpoints liveness cont_info stackmaps
  449                stack0@StackMap { sm_sp = sp0 } tscp middle last
  450   = case last of
  451       --  At each return / tail call,
  452       --  adjust Sp to point to the last argument pushed, which
  453       --  is cml_args, after popping any other junk from the stack.
  454       CmmCall{ cml_cont = Nothing, .. } -> do
  455         let sp_off = sp0 - cml_args
  456         return ([], sp_off, last, [], mapEmpty)
  457 
  458       --  At each CmmCall with a continuation:
  459       CmmCall{ cml_cont = Just cont_lbl, .. } ->
  460         return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off
  461 
  462       CmmForeignCall{ succ = cont_lbl, .. } ->
  463         return $ lastCall cont_lbl (platformWordSizeInBytes platform) ret_args ret_off
  464               -- one word of args: the return address
  465 
  466       CmmBranch {}     ->  handleBranches
  467       CmmCondBranch {} ->  handleBranches
  468       CmmSwitch {}     ->  handleBranches
  469   where
  470      platform = targetPlatform dflags
  471      -- Calls and ForeignCalls are handled the same way:
  472      lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
  473               -> ( [CmmNode O O]
  474                  , ByteOff
  475                  , CmmNode O C
  476                  , [CmmBlock]
  477                  , LabelMap StackMap
  478                  )
  479      lastCall lbl cml_args cml_ret_args cml_ret_off
  480       =  ( assignments
  481          , spOffsetForCall sp0 cont_stack cml_args
  482          , last
  483          , [] -- no new blocks
  484          , mapSingleton lbl cont_stack )
  485       where
  486          (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off
  487 
  488 
  489      prepareStack lbl cml_ret_args cml_ret_off
  490        | Just cont_stack <- mapLookup lbl stackmaps
  491              -- If we have already seen this continuation before, then
  492              -- we just have to make the stack look the same:
  493        = (fixupStack stack0 cont_stack, cont_stack)
  494              -- Otherwise, we have to allocate the stack frame
  495        | otherwise
  496        = (save_assignments, new_cont_stack)
  497        where
  498         (new_cont_stack, save_assignments)
  499            = setupStackFrame platform lbl liveness cml_ret_off cml_ret_args stack0
  500 
  501 
  502      -- For other last nodes (branches), if any of the targets is a
  503      -- proc point, we have to set up the stack to match what the proc
  504      -- point is expecting.
  505      --
  506      handleBranches :: UniqSM ( [CmmNode O O]
  507                                 , ByteOff
  508                                 , CmmNode O C
  509                                 , [CmmBlock]
  510                                 , LabelMap StackMap )
  511 
  512      handleBranches
  513          -- Note [diamond proc point]
  514        | Just l <- futureContinuation middle
  515        , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
  516        = do
  517          let cont_args = mapFindWithDefault 0 l cont_info
  518              (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
  519              out = mapFromList [ (l', cont_stack)
  520                                | l' <- successors last ]
  521          return ( assigs
  522                 , spOffsetForCall sp0 cont_stack (platformWordSizeInBytes platform)
  523                 , last
  524                 , []
  525                 , out)
  526 
  527         | otherwise = do
  528           pps <- mapM handleBranch (successors last)
  529           let lbl_map :: LabelMap Label
  530               lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
  531               fix_lbl l = mapFindWithDefault l l lbl_map
  532           return ( []
  533                  , 0
  534                  , mapSuccessors fix_lbl last
  535                  , concat [ blk | (_,_,_,blk) <- pps ]
  536                  , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )
  537 
  538      -- For each successor of this block
  539      handleBranch :: BlockId -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
  540      handleBranch l
  541         --   (a) if the successor already has a stackmap, we need to
  542         --       shuffle the current stack to make it look the same.
  543         --       We have to insert a new block to make this happen.
  544         | Just stack2 <- mapLookup l stackmaps
  545         = do
  546              let assigs = fixupStack stack0 stack2
  547              (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 tscp assigs
  548              return (l, tmp_lbl, stack2, block)
  549 
  550         --   (b) if the successor is a proc point, save everything
  551         --       on the stack.
  552         | l `setMember` procpoints
  553         = do
  554              let cont_args = mapFindWithDefault 0 l cont_info
  555                  (stack2, assigs) =
  556                       setupStackFrame platform l liveness (sm_ret_off stack0)
  557                                                         cont_args stack0
  558              (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 tscp assigs
  559              return (l, tmp_lbl, stack2, block)
  560 
  561         --   (c) otherwise, the current StackMap is the StackMap for
  562         --       the continuation.  But we must remember to remove any
  563         --       variables from the StackMap that are *not* live at
  564         --       the destination, because this StackMap might be used
  565         --       by fixupStack if this is a join point.
  566         | otherwise = return (l, l, stack1, [])
  567         where live = mapFindWithDefault (panic "handleBranch") l liveness
  568               stack1 = stack0 { sm_regs = filterUFM is_live (sm_regs stack0) }
  569               is_live (r,_) = r `elemRegSet` live
  570 
  571 
  572 makeFixupBlock :: DynFlags -> ByteOff -> Label -> StackMap
  573                -> CmmTickScope -> [CmmNode O O]
  574                -> UniqSM (Label, [CmmBlock])
  575 makeFixupBlock dflags sp0 l stack tscope assigs
  576   | null assigs && sp0 == sm_sp stack = return (l, [])
  577   | otherwise = do
  578     tmp_lbl <- newBlockId
  579     let sp_off = sp0 - sm_sp stack
  580         block = blockJoin (CmmEntry tmp_lbl tscope)
  581                           ( maybeAddSpAdj dflags sp0 sp_off
  582                            $ blockFromList assigs )
  583                           (CmmBranch l)
  584     return (tmp_lbl, [block])
  585 
  586 
  587 -- Sp is currently pointing to current_sp,
  588 -- we want it to point to
  589 --    (sm_sp cont_stack - sm_args cont_stack + args)
  590 -- so the difference is
  591 --    sp0 - (sm_sp cont_stack - sm_args cont_stack + args)
  592 spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
  593 spOffsetForCall current_sp cont_stack args
  594   = current_sp - (sm_sp cont_stack - sm_args cont_stack + args)
  595 
  596 
  597 -- | create a sequence of assignments to establish the new StackMap,
  598 -- given the old StackMap.
  599 fixupStack :: StackMap -> StackMap -> [CmmNode O O]
  600 fixupStack old_stack new_stack = concatMap move new_locs
  601  where
  602      old_map  = sm_regs old_stack
  603      new_locs = stackSlotRegs new_stack
  604 
  605      move (r,n)
  606        | Just (_,m) <- lookupUFM old_map r, n == m = []
  607        | otherwise = [CmmStore (CmmStackSlot Old n)
  608                                (CmmReg (CmmLocal r))]
  609 
  610 
  611 
  612 setupStackFrame
  613              :: Platform
  614              -> BlockId                 -- label of continuation
  615              -> LabelMap CmmLocalLive   -- liveness
  616              -> ByteOff      -- updfr
  617              -> ByteOff      -- bytes of return values on stack
  618              -> StackMap     -- current StackMap
  619              -> (StackMap, [CmmNode O O])
  620 
  621 setupStackFrame platform lbl liveness updfr_off ret_args stack0
  622   = (cont_stack, assignments)
  623   where
  624       -- get the set of LocalRegs live in the continuation
  625       live = mapFindWithDefault Set.empty lbl liveness
  626 
  627       -- the stack from the base to updfr_off is off-limits.
  628       -- our new stack frame contains:
  629       --   * saved live variables
  630       --   * the return address [young(C) + 8]
  631       --   * the args for the call,
  632       --     which are replaced by the return values at the return
  633       --     point.
  634 
  635       -- everything up to updfr_off is off-limits
  636       -- stack1 contains updfr_off, plus everything we need to save
  637       (stack1, assignments) = allocate platform updfr_off live stack0
  638 
  639       -- And the Sp at the continuation is:
  640       --   sm_sp stack1 + ret_args
  641       cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args
  642                          , sm_args = ret_args
  643                          , sm_ret_off = updfr_off
  644                          }
  645 
  646 
  647 -- -----------------------------------------------------------------------------
  648 -- Note [diamond proc point]
  649 --
  650 -- This special case looks for the pattern we get from a typical
  651 -- tagged case expression:
  652 --
  653 --    Sp[young(L1)] = L1
  654 --    if (R1 & 7) != 0 goto L1 else goto L2
  655 --  L2:
  656 --    call [R1] returns to L1
  657 --  L1: live: {y}
  658 --    x = R1
  659 --
  660 -- If we let the generic case handle this, we get
  661 --
  662 --    Sp[-16] = L1
  663 --    if (R1 & 7) != 0 goto L1a else goto L2
  664 --  L2:
  665 --    Sp[-8] = y
  666 --    Sp = Sp - 16
  667 --    call [R1] returns to L1
  668 --  L1a:
  669 --    Sp[-8] = y
  670 --    Sp = Sp - 16
  671 --    goto L1
  672 --  L1:
  673 --    x = R1
  674 --
  675 -- The code for saving the live vars is duplicated in each branch, and
  676 -- furthermore there is an extra jump in the fast path (assuming L1 is
  677 -- a proc point, which it probably is if there is a heap check).
  678 --
  679 -- So to fix this we want to set up the stack frame before the
  680 -- conditional jump.  How do we know when to do this, and when it is
  681 -- safe?  The basic idea is, when we see the assignment
  682 --
  683 --   Sp[young(L)] = L
  684 --
  685 -- we know that
  686 --   * we are definitely heading for L
  687 --   * there can be no more reads from another stack area, because young(L)
  688 --     overlaps with it.
  689 --
  690 -- We don't necessarily know that everything live at L is live now
  691 -- (some might be assigned between here and the jump to L).  So we
  692 -- simplify and only do the optimisation when we see
  693 --
  694 --   (1) a block containing an assignment of a return address L
  695 --   (2) ending in a branch where one (and only) continuation goes to L,
  696 --       and no other continuations go to proc points.
  697 --
  698 -- then we allocate the stack frame for L at the end of the block,
  699 -- before the branch.
  700 --
  701 -- We could generalise (2), but that would make it a bit more
  702 -- complicated to handle, and this currently catches the common case.
  703 
  704 futureContinuation :: Block CmmNode O O -> Maybe BlockId
  705 futureContinuation middle = foldBlockNodesB f middle Nothing
  706    where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
  707          f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _
  708                = Just l
  709          f _ r = r
  710 
  711 -- -----------------------------------------------------------------------------
  712 -- Saving live registers
  713 
  714 -- | Given a set of live registers and a StackMap, save all the registers
  715 -- on the stack and return the new StackMap and the assignments to do
  716 -- the saving.
  717 --
  718 allocate :: Platform -> ByteOff -> LocalRegSet -> StackMap
  719          -> (StackMap, [CmmNode O O])
  720 allocate platform ret_off live stackmap@StackMap{ sm_sp = sp0
  721                                               , sm_regs = regs0 }
  722  =
  723    -- we only have to save regs that are not already in a slot
  724    let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
  725        regs1   = filterUFM (\(r,_) -> elemRegSet r live) regs0
  726    in
  727 
  728    -- make a map of the stack
  729    let stack = reverse $ Array.elems $
  730                accumArray (\_ x -> x) Empty (1, toWords platform (max sp0 ret_off)) $
  731                  ret_words ++ live_words
  732             where ret_words =
  733                    [ (x, Occupied)
  734                    | x <- [ 1 .. toWords platform ret_off] ]
  735                   live_words =
  736                    [ (toWords platform x, Occupied)
  737                    | (r,off) <- nonDetEltsUFM regs1,
  738                    -- See Note [Unique Determinism and code generation]
  739                      let w = localRegBytes platform r,
  740                      x <- [ off, off - platformWordSizeInBytes platform .. off - w + 1] ]
  741    in
  742 
  743    -- Pass over the stack: find slots to save all the new live variables,
  744    -- choosing the oldest slots first (hence a foldr).
  745    let
  746        save slot ([], stack, n, assigs, regs) -- no more regs to save
  747           = ([], slot:stack, plusW platform n 1, assigs, regs)
  748        save slot (to_save, stack, n, assigs, regs)
  749           = case slot of
  750                Occupied ->  (to_save, Occupied:stack, plusW platform n 1, assigs, regs)
  751                Empty
  752                  | Just (stack', r, to_save') <-
  753                        select_save to_save (slot:stack)
  754                  -> let assig = CmmStore (CmmStackSlot Old n')
  755                                          (CmmReg (CmmLocal r))
  756                         n' = plusW platform n 1
  757                    in
  758                         (to_save', stack', n', assig : assigs, (r,(r,n')):regs)
  759 
  760                  | otherwise
  761                  -> (to_save, slot:stack, plusW platform n 1, assigs, regs)
  762 
  763        -- we should do better here: right now we'll fit the smallest first,
  764        -- but it would make more sense to fit the biggest first.
  765        select_save :: [LocalReg] -> [StackSlot]
  766                    -> Maybe ([StackSlot], LocalReg, [LocalReg])
  767        select_save regs stack = go regs []
  768          where go []     _no_fit = Nothing
  769                go (r:rs) no_fit
  770                  | Just rest <- dropEmpty words stack
  771                  = Just (replicate words Occupied ++ rest, r, rs++no_fit)
  772                  | otherwise
  773                  = go rs (r:no_fit)
  774                  where words = localRegWords platform r
  775 
  776        -- fill in empty slots as much as possible
  777        (still_to_save, save_stack, n, save_assigs, save_regs)
  778           = foldr save (to_save, [], 0, [], []) stack
  779 
  780        -- push any remaining live vars on the stack
  781        (push_sp, push_assigs, push_regs)
  782           = foldr push (n, [], []) still_to_save
  783           where
  784               push r (n, assigs, regs)
  785                 = (n', assig : assigs, (r,(r,n')) : regs)
  786                 where
  787                   n' = n + localRegBytes platform r
  788                   assig = CmmStore (CmmStackSlot Old n')
  789                                    (CmmReg (CmmLocal r))
  790 
  791        trim_sp
  792           | not (null push_regs) = push_sp
  793           | otherwise
  794           = plusW platform n (- length (takeWhile isEmpty save_stack))
  795 
  796        final_regs = regs1 `addListToUFM` push_regs
  797                           `addListToUFM` save_regs
  798 
  799    in
  800   -- XXX should be an assert
  801    if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else
  802 
  803    if (trim_sp .&. (platformWordSizeInBytes platform - 1)) /= 0  then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else
  804 
  805    ( stackmap { sm_regs = final_regs , sm_sp = trim_sp }
  806    , push_assigs ++ save_assigs )
  807 
  808 
  809 -- -----------------------------------------------------------------------------
  810 -- Manifesting Sp
  811 
  812 -- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp.  The
  813 -- block looks like this:
  814 --
  815 --    middle_pre       -- the middle nodes
  816 --    Sp = Sp + sp_off -- Sp adjustment goes here
  817 --    last             -- the last node
  818 --
  819 -- And we have some extra blocks too (that don't contain Sp adjustments)
  820 --
  821 -- The adjustment for middle_pre will be different from that for
  822 -- middle_post, because the Sp adjustment intervenes.
  823 --
  824 manifestSp
  825    :: DynFlags
  826    -> LabelMap StackMap  -- StackMaps for other blocks
  827    -> StackMap           -- StackMap for this block
  828    -> ByteOff            -- Sp on entry to the block
  829    -> ByteOff            -- SpHigh
  830    -> CmmNode C O        -- first node
  831    -> [CmmNode O O]      -- middle
  832    -> ByteOff            -- sp_off
  833    -> CmmNode O C        -- last node
  834    -> [CmmBlock]         -- new blocks
  835    -> [CmmBlock]         -- final blocks with Sp manifest
  836 
  837 manifestSp dflags stackmaps stack0 sp0 sp_high
  838            first middle_pre sp_off last fixup_blocks
  839   = final_block : fixup_blocks'
  840   where
  841     area_off = getAreaOff stackmaps
  842     platform = targetPlatform dflags
  843 
  844     adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
  845     adj_pre_sp  = mapExpDeep (areaToSp platform sp0            sp_high area_off)
  846     adj_post_sp = mapExpDeep (areaToSp platform (sp0 - sp_off) sp_high area_off)
  847 
  848     final_middle = maybeAddSpAdj dflags sp0 sp_off
  849                  . blockFromList
  850                  . map adj_pre_sp
  851                  . elimStackStores stack0 stackmaps area_off
  852                  $ middle_pre
  853     final_last    = optStackCheck (adj_post_sp last)
  854 
  855     final_block   = blockJoin first final_middle final_last
  856 
  857     fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks
  858 
  859 getAreaOff :: LabelMap StackMap -> (Area -> StackLoc)
  860 getAreaOff _ Old = 0
  861 getAreaOff stackmaps (Young l) =
  862   case mapLookup l stackmaps of
  863     Just sm -> sm_sp sm - sm_args sm
  864     Nothing -> pprPanic "getAreaOff" (ppr l)
  865 
  866 
  867 maybeAddSpAdj
  868   :: DynFlags -> ByteOff -> ByteOff -> Block CmmNode O O -> Block CmmNode O O
  869 maybeAddSpAdj dflags sp0 sp_off block =
  870   add_initial_unwind $ add_adj_unwind $ adj block
  871   where
  872     platform = targetPlatform dflags
  873     adj block
  874       | sp_off /= 0
  875       = block `blockSnoc` CmmAssign spReg (cmmOffset platform spExpr sp_off)
  876       | otherwise = block
  877     -- Add unwind pseudo-instruction at the beginning of each block to
  878     -- document Sp level for debugging
  879     add_initial_unwind block
  880       | debugLevel dflags > 0
  881       = CmmUnwind [(Sp, Just sp_unwind)] `blockCons` block
  882       | otherwise
  883       = block
  884       where sp_unwind = CmmRegOff spReg (sp0 - platformWordSizeInBytes platform)
  885 
  886     -- Add unwind pseudo-instruction right after the Sp adjustment
  887     -- if there is one.
  888     add_adj_unwind block
  889       | debugLevel dflags > 0
  890       , sp_off /= 0
  891       = block `blockSnoc` CmmUnwind [(Sp, Just sp_unwind)]
  892       | otherwise
  893       = block
  894       where sp_unwind = CmmRegOff spReg (sp0 - platformWordSizeInBytes platform - sp_off)
  895 
  896 {- Note [SP old/young offsets]
  897 
  898 Sp(L) is the Sp offset on entry to block L relative to the base of the
  899 OLD area.
  900 
  901 SpArgs(L) is the size of the young area for L, i.e. the number of
  902 arguments.
  903 
  904  - in block L, each reference to [old + N] turns into
  905    [Sp + Sp(L) - N]
  906 
  907  - in block L, each reference to [young(L') + N] turns into
  908    [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]
  909 
  910  - be careful with the last node of each block: Sp has already been adjusted
  911    to be Sp + Sp(L) - Sp(L')
  912 -}
  913 
  914 areaToSp :: Platform -> ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr
  915 
  916 areaToSp platform sp_old _sp_hwm area_off (CmmStackSlot area n)
  917   = cmmOffset platform spExpr (sp_old - area_off area - n)
  918     -- Replace (CmmStackSlot area n) with an offset from Sp
  919 
  920 areaToSp platform _ sp_hwm _ (CmmLit CmmHighStackMark)
  921   = mkIntExpr platform sp_hwm
  922     -- Replace CmmHighStackMark with the number of bytes of stack used,
  923     -- the sp_hwm.   See Note [Stack usage] in GHC.StgToCmm.Heap
  924 
  925 areaToSp platform _ _ _ (CmmMachOp (MO_U_Lt _) args)
  926   | falseStackCheck args
  927   = zeroExpr platform
  928 areaToSp platform _ _ _ (CmmMachOp (MO_U_Ge _) args)
  929   | falseStackCheck args
  930   = mkIntExpr platform 1
  931     -- Replace a stack-overflow test that cannot fail with a no-op
  932     -- See Note [Always false stack check]
  933 
  934 areaToSp _ _ _ _ other = other
  935 
  936 -- | Determine whether a stack check cannot fail.
  937 falseStackCheck :: [CmmExpr] -> Bool
  938 falseStackCheck [ CmmMachOp (MO_Sub _)
  939                       [ CmmRegOff (CmmGlobal Sp) x_off
  940                       , CmmLit (CmmInt y_lit _)]
  941                 , CmmReg (CmmGlobal SpLim)]
  942   = fromIntegral x_off >= y_lit
  943 falseStackCheck _ = False
  944 
  945 -- Note [Always false stack check]
  946 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  947 -- We can optimise stack checks of the form
  948 --
  949 --   if ((Sp + x) - y < SpLim) then .. else ..
  950 --
  951 -- where are non-negative integer byte offsets.  Since we know that
  952 -- SpLim <= Sp (remember the stack grows downwards), this test must
  953 -- yield False if (x >= y), so we can rewrite the comparison to False.
  954 -- A subsequent sinking pass will later drop the dead code.
  955 -- Optimising this away depends on knowing that SpLim <= Sp, so it is
  956 -- really the job of the stack layout algorithm, hence we do it now.
  957 --
  958 -- The control flow optimiser may negate a conditional to increase
  959 -- the likelihood of a fallthrough if the branch is not taken.  But
  960 -- not every conditional is inverted as the control flow optimiser
  961 -- places some requirements on the predecessors of both branch targets.
  962 -- So we better look for the inverted comparison too.
  963 
  964 optStackCheck :: CmmNode O C -> CmmNode O C
  965 optStackCheck n = -- Note [Always false stack check]
  966  case n of
  967    CmmCondBranch (CmmLit (CmmInt 0 _)) _true false _ -> CmmBranch false
  968    CmmCondBranch (CmmLit (CmmInt _ _)) true _false _ -> CmmBranch true
  969    other -> other
  970 
  971 
  972 -- -----------------------------------------------------------------------------
  973 
  974 -- | Eliminate stores of the form
  975 --
  976 --    Sp[area+n] = r
  977 --
  978 -- when we know that r is already in the same slot as Sp[area+n].  We
  979 -- could do this in a later optimisation pass, but that would involve
  980 -- a separate analysis and we already have the information to hand
  981 -- here.  It helps clean up some extra stack stores in common cases.
  982 --
  983 -- Note that we may have to modify the StackMap as we walk through the
  984 -- code using procMiddle, since an assignment to a variable in the
  985 -- StackMap will invalidate its mapping there.
  986 --
  987 elimStackStores :: StackMap
  988                 -> LabelMap StackMap
  989                 -> (Area -> ByteOff)
  990                 -> [CmmNode O O]
  991                 -> [CmmNode O O]
  992 elimStackStores stackmap stackmaps area_off nodes
  993   = go stackmap nodes
  994   where
  995     go _stackmap [] = []
  996     go stackmap (n:ns)
  997      = case n of
  998          CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r))
  999             | Just (_,off) <- lookupUFM (sm_regs stackmap) r
 1000             , area_off area + m == off
 1001             -> go stackmap ns
 1002          _otherwise
 1003             -> n : go (procMiddle stackmaps n stackmap) ns
 1004 
 1005 
 1006 -- -----------------------------------------------------------------------------
 1007 -- Update info tables to include stack liveness
 1008 
 1009 
 1010 setInfoTableStackMap :: Platform -> LabelMap StackMap -> CmmDecl -> CmmDecl
 1011 setInfoTableStackMap platform stackmaps (CmmProc top_info@TopInfo{..} l v g)
 1012   = CmmProc top_info{ info_tbls = mapMapWithKey fix_info info_tbls } l v g
 1013   where
 1014     fix_info lbl info_tbl@CmmInfoTable{ cit_rep = StackRep _ } =
 1015        info_tbl { cit_rep = StackRep (get_liveness lbl) }
 1016     fix_info _ other = other
 1017 
 1018     get_liveness :: BlockId -> Liveness
 1019     get_liveness lbl
 1020       = case mapLookup lbl stackmaps of
 1021           Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl <+> pdoc platform info_tbls)
 1022           Just sm -> stackMapToLiveness platform sm
 1023 
 1024 setInfoTableStackMap _ _ d = d
 1025 
 1026 
 1027 stackMapToLiveness :: Platform -> StackMap -> Liveness
 1028 stackMapToLiveness platform StackMap{..} =
 1029    reverse $ Array.elems $
 1030         accumArray (\_ x -> x) True (toWords platform sm_ret_off + 1,
 1031                                      toWords platform (sm_sp - sm_args)) live_words
 1032    where
 1033      live_words =  [ (toWords platform off, False)
 1034                    | (r,off) <- nonDetEltsUFM sm_regs
 1035                    , isGcPtrType (localRegType r) ]
 1036                    -- See Note [Unique Determinism and code generation]
 1037 
 1038 -- -----------------------------------------------------------------------------
 1039 -- Pass 2
 1040 -- -----------------------------------------------------------------------------
 1041 
 1042 insertReloadsAsNeeded
 1043     :: Platform
 1044     -> ProcPointSet
 1045     -> LabelMap StackMap
 1046     -> BlockId
 1047     -> [CmmBlock]
 1048     -> UniqSM [CmmBlock]
 1049 insertReloadsAsNeeded platform procpoints final_stackmaps entry blocks =
 1050     toBlockList . fst <$>
 1051         rewriteCmmBwd liveLattice rewriteCC (ofBlockList entry blocks) mapEmpty
 1052   where
 1053     rewriteCC :: RewriteFun CmmLocalLive
 1054     rewriteCC (BlockCC e_node middle0 x_node) fact_base0 = do
 1055         let entry_label = entryLabel e_node
 1056             stackmap = case mapLookup entry_label final_stackmaps of
 1057                 Just sm -> sm
 1058                 Nothing -> panic "insertReloadsAsNeeded: rewriteCC: stackmap"
 1059 
 1060             -- Merge the liveness from successor blocks and analyse the last
 1061             -- node.
 1062             joined = gen_kill platform x_node $!
 1063                          joinOutFacts liveLattice x_node fact_base0
 1064             -- What is live at the start of middle0.
 1065             live_at_middle0 = foldNodesBwdOO (gen_kill platform) middle0 joined
 1066 
 1067             -- If this is a procpoint we need to add the reloads, but only if
 1068             -- they're actually live. Furthermore, nothing is live at the entry
 1069             -- to a proc point.
 1070             (middle1, live_with_reloads)
 1071                 | entry_label `setMember` procpoints
 1072                 = let reloads = insertReloads platform stackmap live_at_middle0
 1073                   in (foldr blockCons middle0 reloads, emptyRegSet)
 1074                 | otherwise
 1075                 = (middle0, live_at_middle0)
 1076 
 1077             -- Final liveness for this block.
 1078             !fact_base2 = mapSingleton entry_label live_with_reloads
 1079 
 1080         return (BlockCC e_node middle1 x_node, fact_base2)
 1081 
 1082 insertReloads :: Platform -> StackMap -> CmmLocalLive -> [CmmNode O O]
 1083 insertReloads platform stackmap live =
 1084      [ CmmAssign (CmmLocal reg)
 1085                  -- This cmmOffset basically corresponds to manifesting
 1086                  -- @CmmStackSlot Old sp_off@, see Note [SP old/young offsets]
 1087                  (CmmLoad (cmmOffset platform spExpr (sp_off - reg_off))
 1088                           (localRegType reg))
 1089      | (reg, reg_off) <- stackSlotRegs stackmap
 1090      , reg `elemRegSet` live
 1091      ]
 1092    where
 1093      sp_off = sm_sp stackmap
 1094 
 1095 -- -----------------------------------------------------------------------------
 1096 -- Lowering safe foreign calls
 1097 
 1098 {-
 1099 Note [Lower safe foreign calls]
 1100 
 1101 We start with
 1102 
 1103    Sp[young(L1)] = L1
 1104  ,-----------------------
 1105  | r1 = foo(x,y,z) returns to L1
 1106  '-----------------------
 1107  L1:
 1108    R1 = r1 -- copyIn, inserted by mkSafeCall
 1109    ...
 1110 
 1111 the stack layout algorithm will arrange to save and reload everything
 1112 live across the call.  Our job now is to expand the call so we get
 1113 
 1114    Sp[young(L1)] = L1
 1115  ,-----------------------
 1116  | SAVE_THREAD_STATE()
 1117  | token = suspendThread(BaseReg, interruptible)
 1118  | r = foo(x,y,z)
 1119  | BaseReg = resumeThread(token)
 1120  | LOAD_THREAD_STATE()
 1121  | R1 = r  -- copyOut
 1122  | jump Sp[0]
 1123  '-----------------------
 1124  L1:
 1125    r = R1 -- copyIn, inserted by mkSafeCall
 1126    ...
 1127 
 1128 Note the copyOut, which saves the results in the places that L1 is
 1129 expecting them (see Note [safe foreign call convention]). Note also
 1130 that safe foreign call is replace by an unsafe one in the Cmm graph.
 1131 -}
 1132 
 1133 lowerSafeForeignCall :: Profile -> CmmBlock -> UniqSM CmmBlock
 1134 lowerSafeForeignCall profile block
 1135   | (entry@(CmmEntry _ tscp), middle, CmmForeignCall { .. }) <- blockSplit block
 1136   = do
 1137     let platform = profilePlatform profile
 1138     -- Both 'id' and 'new_base' are KindNonPtr because they're
 1139     -- RTS-only objects and are not subject to garbage collection
 1140     id <- newTemp (bWord platform)
 1141     new_base <- newTemp (cmmRegType platform baseReg)
 1142     let (caller_save, caller_load) = callerSaveVolatileRegs platform
 1143     save_state_code <- saveThreadState profile
 1144     load_state_code <- loadThreadState profile
 1145     let suspend = save_state_code  <*>
 1146                   caller_save <*>
 1147                   mkMiddle (callSuspendThread platform id intrbl)
 1148         midCall = mkUnsafeCall tgt res args
 1149         resume  = mkMiddle (callResumeThread new_base id) <*>
 1150                   -- Assign the result to BaseReg: we
 1151                   -- might now have a different Capability!
 1152                   mkAssign baseReg (CmmReg (CmmLocal new_base)) <*>
 1153                   caller_load <*>
 1154                   load_state_code
 1155 
 1156         (_, regs, copyout) =
 1157              copyOutOflow profile NativeReturn Jump (Young succ)
 1158                             (map (CmmReg . CmmLocal) res)
 1159                             ret_off []
 1160 
 1161         -- NB. after resumeThread returns, the top-of-stack probably contains
 1162         -- the stack frame for succ, but it might not: if the current thread
 1163         -- received an exception during the call, then the stack might be
 1164         -- different.  Hence we continue by jumping to the top stack frame,
 1165         -- not by jumping to succ.
 1166         jump = CmmCall { cml_target    = entryCode platform $
 1167                                          CmmLoad spExpr (bWord platform)
 1168                        , cml_cont      = Just succ
 1169                        , cml_args_regs = regs
 1170                        , cml_args      = widthInBytes (wordWidth platform)
 1171                        , cml_ret_args  = ret_args
 1172                        , cml_ret_off   = ret_off }
 1173 
 1174     graph' <- lgraphOfAGraph ( suspend <*>
 1175                                midCall <*>
 1176                                resume  <*>
 1177                                copyout <*>
 1178                                mkLast jump, tscp)
 1179 
 1180     case toBlockList graph' of
 1181       [one] -> let (_, middle', last) = blockSplit one
 1182                in return (blockJoin entry (middle `blockAppend` middle') last)
 1183       _ -> panic "lowerSafeForeignCall0"
 1184 
 1185   -- Block doesn't end in a safe foreign call:
 1186   | otherwise = return block
 1187 
 1188 
 1189 callSuspendThread :: Platform -> LocalReg -> Bool -> CmmNode O O
 1190 callSuspendThread platform id intrbl =
 1191   CmmUnsafeForeignCall (PrimTarget MO_SuspendThread)
 1192        [id] [baseExpr, mkIntExpr platform (fromEnum intrbl)]
 1193 
 1194 callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
 1195 callResumeThread new_base id =
 1196   CmmUnsafeForeignCall (PrimTarget MO_ResumeThread)
 1197        [new_base] [CmmReg (CmmLocal id)]
 1198 
 1199 -- -----------------------------------------------------------------------------
 1200 
 1201 plusW :: Platform -> ByteOff -> WordOff -> ByteOff
 1202 plusW platform b w = b + w * platformWordSizeInBytes platform
 1203 
 1204 data StackSlot = Occupied | Empty
 1205      -- Occupied: a return address or part of an update frame
 1206 
 1207 instance Outputable StackSlot where
 1208   ppr Occupied = text "XXX"
 1209   ppr Empty    = text "---"
 1210 
 1211 dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
 1212 dropEmpty 0 ss           = Just ss
 1213 dropEmpty n (Empty : ss) = dropEmpty (n-1) ss
 1214 dropEmpty _ _            = Nothing
 1215 
 1216 isEmpty :: StackSlot -> Bool
 1217 isEmpty Empty = True
 1218 isEmpty _ = False
 1219 
 1220 localRegBytes :: Platform -> LocalReg -> ByteOff
 1221 localRegBytes platform r
 1222     = roundUpToWords platform (widthInBytes (typeWidth (localRegType r)))
 1223 
 1224 localRegWords :: Platform -> LocalReg -> WordOff
 1225 localRegWords platform = toWords platform . localRegBytes platform
 1226 
 1227 toWords :: Platform -> ByteOff -> WordOff
 1228 toWords platform x = x `quot` platformWordSizeInBytes platform
 1229 
 1230 
 1231 stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
 1232 stackSlotRegs sm = nonDetEltsUFM (sm_regs sm)
 1233   -- See Note [Unique Determinism and code generation]