{-# LANGUAGE GADTs #-}
{-# LANGUAGE ScopedTypeVariables #-}

module GHC.Cmm.Sink (
     cmmSink
  ) where

import GHC.Prelude

import GHC.Cmm
import GHC.Cmm.Opt
import GHC.Cmm.Liveness
import GHC.Cmm.LRegSet
import GHC.Cmm.Utils
import GHC.Cmm.Dataflow.Block
import GHC.Cmm.Dataflow.Label
import GHC.Cmm.Dataflow.Collections
import GHC.Cmm.Dataflow.Graph
import GHC.Platform.Regs

import GHC.Platform
import GHC.Types.Unique.FM

import qualified Data.IntSet as IntSet
import Data.List (partition)
import Data.Maybe

import GHC.Exts (inline)

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- We really want to push the x1..x3 assignments into the L2 branch.
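--
-- After sinking, the result we are aiming for is (a sketch of the
-- intended outcome, not verbatim compiler output):
--
--    if (Sp - 32 < SpLim) then L1 else L2
--    ...
--  L2:
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]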
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we don't
--    have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. assignment to a register that is
--      not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join point),
--        drop everything live in that successor. Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point. Thus analysis of a
--        join point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.

-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: Platform -> CmmGraph -> CmmGraph
cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLivenessL platform graph
  getLive l = mapFindWithDefault emptyLRegSet l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = IntSet.unions (map getLive succs)
      live_middle = gen_killL platform last live
      ann_middles = annotate platform live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk platform ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode platform last
      (final_last, assigs') = tryToInline platform live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = IntSet.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (elemLRegSet r) live_sets of
           (_one:_two:_) -> True
           _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments platform drop_if init_live_sets assigs'

      drop_if :: (LocalReg, CmmExpr, AbsMem)
                      -> [LRegSet] -> (Bool, [LRegSet])
      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts platform a final_last
                        || not (isTrivial platform rhs) && live_in_multi live_sets r
                        || r `elemLRegSet` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `elemLRegSet` set = set `IntSet.union` live_rhs
                    | otherwise           = set

            live_rhs = foldRegsUsed platform (flip insertLRegSet) emptyLRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments platform (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: Platform -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial platform (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch platform)
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe platform r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _          = False
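
-- For example (illustrative, not exhaustive):
--
--   isTrivial platform (CmmReg (CmmLocal r))    == True
--   isTrivial platform (CmmLit (CmmInt 42 W64)) == True
--   isTrivial platform (CmmLoad addr ty)        == False
--
-- so a pending assignment whose RHS is a register or a literal may be
-- inlined freely, even at several use sites.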

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: Platform -> LRegSet -> [CmmNode O O] -> [(LRegSet, CmmNode O O)]
annotate platform live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_killL platform n live, (live,n) : nodes)
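-- For example (illustrative): given the node sequence [x = R1; y = x + 1]
-- with {y} live at the end of the block,
--
--   annotate platform {y} nodes == [ ({x}, x = R1), ({y}, y = x + 1) ]
--
-- i.e. each node is paired with the set of registers live *after* it.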

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
 where
  all_succs = concatMap successors blocks

  succ_counts :: LabelMap Int
  succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
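
-- For example (illustrative): in the diamond
--
--        A
--       / \
--      B   C
--       \ /
--        D
--
-- the label D appears as a successor of both B and C, so findJoinPoints
-- returns a map in which D is mapped to 2: D is a join point, and we
-- will not sink assignments into it.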

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: Platform -> LRegSet -> Assignments -> Assignments
filterAssignments platform live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `elemLRegSet` live
                       || any (conflicts platform a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.
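
-- For example (illustrative): with live = {x} and the (reverse-order)
-- list [ x = y, y = e ], we keep x = y because x is live, and we must
-- then also keep y = e, even though y is dead, because the already-kept
-- assignment x = y refers to y.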

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in:
--    * a list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: Platform
     -> [(LRegSet, CmmNode O O)]        -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk platform nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live             = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink platform node2 = go ns block (a : as1)
    | otherwise                           = go ns block' as'
    where
      node1 = constantFoldNode platform node

      (node2, as1) = tryToInline platform live node1 as

      (dropped, as') = dropAssignmentsSimple platform
                          (\a -> conflicts platform a node2) as1

      block' = foldl' blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: Platform -> CmmNode e x -> Maybe Assignment
shouldSink platform (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem platform e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `elemLRegSet` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: Platform -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple platform f = dropAssignments platform (\a _ -> (f a, ())) ()

dropAssignments :: Platform -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments platform should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _ []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts platform assig) dropped
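
-- For example (illustrative): suppose the pending (reverse-order) list
-- is [ x = y*2, y = R1 ] and the current node conflicts with x = y*2
-- (say it reads x).  Then x = y*2 is dropped, and when we reach y = R1
-- the `any (conflicts platform assig) dropped` check fires, because the
-- already-dropped node x = y*2 reads y.  Both assignments are therefore
-- emitted before the node, in the order  y = R1; x = y*2.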

-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: forall x. Platform
   -> LRegSet                   -- set of registers live after this
  --  -> LocalRegSet            -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline platform liveAfter node assigs =
  -- pprTrace "tryToInline assig length:" (ppr $ length assigs) $
    go usages liveAfter node emptyLRegSet assigs
 where
  usages :: UniqFM LocalReg Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed platform addUsage emptyUFM node

  go :: UniqFM LocalReg Int -> LRegSet -> CmmNode O x -> LRegSet -> Assignments
     -> (CmmNode O x, Assignments)
  go _usages _live node _skipped [] = (node, [])

  go usages live node skipped (a@(l,rhs,_) : rest)
   | cannot_inline            = dont_inline
   | occurs_none              = discard  -- Note [discard during inlining]
   | occurs_once              = inline_and_discard
   | isTrivial platform rhs   = inline_and_keep
   | otherwise                = dont_inline
   where
        inline_and_discard = go usages' live inl_node skipped rest
          where usages' = foldLocalRegsUsed platform addUsage usages rhs

        discard = go usages live node skipped rest

        dont_inline        = keep node     -- don't inline the assignment, keep it
        inline_and_keep    = keep inl_node -- inline the assignment, keep it

        keep :: CmmNode O x -> (CmmNode O x, Assignments)
        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages live' node' (insertLRegSet l skipped) rest

                -- Avoid discarding of assignments to vars on the rhs.
                -- See Note [Keeping assignments mentioned in skipped RHSs]
                -- usages' = foldLocalRegsUsed platform (\m r -> addToUFM m r 2)
                --                             usages rhs
                live' = inline foldLocalRegsUsed platform (\m r -> insertLRegSet r m)
                                            live rhs

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline platform rhs node)

        -- How often is l used in the current node.
        l_usages = lookupUFM usages l
        l_live   = l `elemLRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset platform rhs off
                    -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold platform op args
        inl_exp other = other
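
{- For example (illustrative): with the pending assignment x = R1 and the
   node

     y = P64[x + 7]

   where x occurs once in the node and is not live afterwards, tryToInline
   rewrites the node to

     y = P64[R1 + 7]

   and removes x = R1 from the list (occurs_once).  Had the RHS been
   trivial and x still live, we would inline *and* keep the assignment
   (inline_and_keep).
-}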

{- Note [Keeping assignments mentioned in skipped RHSs]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    If we have two assignments [z = y, y = e1] and we skip z, we *must*
    retain the assignment y = e1. This is because we might inline
    "z = y" into another node later on, so we must ensure that y is
    still defined at that point.

    If we dropped the assignment "y = e1" then we would end up
    referencing a variable that is no longer defined after inlining.

    We use a hack to do this.

    We pretend the regs from the rhs are live after the current
    node. Since we only discard assignments to variables
    which are dead after the current block, this prevents discarding of
    the assignment. It still allows inlining, should e1 be a trivial
    rhs, however.

-}

{- Note [improveConditional]

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaNs).

But we *can* optimise this conditional by swapping the true and false
branches. Given
  CmmCondBranch ((a >## b) != 1) t f
we can turn it into
  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply the above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps (see swapcond_last in
GHC.Cmm.ContFlowOpt), so instead we carefully look for just the cases
where we can eliminate a comparison.
-}
improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _) = True
    neLike (MO_U_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
    neLike (MO_S_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
    neLike _ = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--   [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM LocalReg Int -> LocalReg -> UniqFM LocalReg Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = go ls e False
  where use :: LRegSet -> CmmExpr -> Bool -> Bool
        use ls (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        use ls (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        use _ls _ z = z

        go :: LRegSet -> CmmExpr -> Bool -> Bool
        go ls (CmmMachOp _ es) z = foldr (go ls) z es
        go ls (CmmLoad addr _) z = go ls addr z
        go ls e                z = use ls e z

-- We don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing].
-- See also GHC.StgToCmm.Foreign.load_args_into_temps.
okToInline :: Platform -> CmmExpr -> CmmNode e x -> Bool
okToInline platform expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict platform expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: Platform -> Assignment -> CmmNode O x -> Bool
conflicts platform (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict platform rhs node                       = True
  | localRegistersConflict  platform rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed platform (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr platform addr' (cmmExprWidth platform e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem    <- addr, CmmAssign (CmmGlobal Hp) _ <- node        = True
  | StackMem   <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True
  | SpMem{}    <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) suspendThread clobbers every global register not backed by a real
  -- register. It also clobbers heap and stack but this is handled by (5)
  | CmmUnsafeForeignCall (PrimTarget MO_SuspendThread) _ _ <- node
  , foldRegsUsed platform (\b g -> globalRegMaybe platform g == Nothing || b) False rhs
  = True

  -- (7) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (8) otherwise, no conflict
  | otherwise = False
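
-- For example (illustrative): the assignment  x = R1  conflicts with the
-- store  P64[Sp - 8] = x  by clause (2), because the store uses the
-- register x defined by the assignment.  By contrast,  x = Sp[8]  can be
-- commuted past  P64[Sp + 16] = e  (for 8-byte accesses): clause (3)
-- compares SpMem 8 8 with SpMem 16 8, which do not overlap.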

{- Note [Inlining foldRegsDefd]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   foldRegsDefd is, after optimization, *not* a small function so
   it's only marked INLINEABLE, but not INLINE.

   However in some specific cases we call it *very* often, making it
   important to avoid the overhead of allocating the folding function.

   So we simply force inlining via the magic inline function.
   For T3294 this improves allocation with -O by ~1%.

-}

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: Platform -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict platform expr node =
    -- See Note [Inlining foldRegsDefd]
    inline foldRegsDefd platform (\b r -> b || regUsedIn platform (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: Platform -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict platform expr node =
    -- See Note [Inlining foldRegsDefd]
    inline foldRegsDefd platform (\b r -> b || regUsedIn platform (CmmLocal  r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]), which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember that we are not sinking assignments
--      to stack locations):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect a conflict with
--      { P64[Sp - 8]  = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check whether any assignment conflicts with CmmCall.
--      Remember that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..
--
-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
--  the store to [x] should be "new heap", not "old heap".
--  Furthermore, you could imagine that if we started inlining
--  functions in Cmm then there might well be reads of heap memory
--  that was written in the same basic block.  To take advantage of
--  non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.
--
-- `suspendThread` releases the capability used by the thread, hence we mustn't
-- float accesses to heap, stack or virtual global registers stored in the
-- capability (e.g. with an unregisterised build, see #19237).


bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem
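
-- For example (illustrative):
--
--   bothMems (SpMem 8 8) (SpMem 8 4)  == SpMem 8 8   -- same slot, widest width
--   bothMems (SpMem 8 8) (SpMem 16 8) == StackMem    -- different Sp offsets
--   bothMems HeapMem StackMem         == AnyMem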

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem      _          = False
memConflicts _          NoMem      = False
memConflicts HeapMem    StackMem   = False
memConflicts StackMem   HeapMem    = False
memConflicts SpMem{}    HeapMem    = False
memConflicts HeapMem    SpMem{}    = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _         _         = True
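
-- For example (illustrative): two Sp-relative accesses conflict exactly
-- when their byte ranges overlap:
--
--   memConflicts (SpMem 0 8)  (SpMem 8 8) == False   -- [0,8) and [8,16)
--   memConflicts (SpMem 0 16) (SpMem 8 8) == True    -- [0,16) overlaps [8,16)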

exprMem :: Platform -> CmmExpr -> AbsMem
exprMem platform (CmmLoad addr w)  = bothMems (loadAddr platform addr (typeWidth w)) (exprMem platform addr)
exprMem platform (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem platform) es)
exprMem _        _                 = NoMem

loadAddr :: Platform -> CmmExpr -> Width -> AbsMem
loadAddr platform e w =
  case e of
   CmmReg r       -> regAddr platform r 0 w
   CmmRegOff r i  -> regAddr platform r i w
   _other | regUsedIn platform spReg e -> StackMem
          | otherwise                  -> AnyMem

regAddr :: Platform -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr platform r _ _ | isGcPtrType (cmmRegType platform r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem
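
-- For example (illustrative): an 8-byte load from Sp + 8 is classified as
--
--   loadAddr platform (CmmRegOff spReg 8) W64 == SpMem 8 8
--
-- while a load through Hp, CurrentTSO or any GC-pointer register is
-- HeapMem, and anything else falls back to StackMem or AnyMem depending
-- on whether Sp is involved.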

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, so we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

      _s3L2::F32 = F1;
      _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
      (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(_c3O3::F32);

but if we don't inline F1:

      (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                            10.0 :: W32));
-}