{-# LANGUAGE GADTs #-}
{-# LANGUAGE ScopedTypeVariables #-}

module GHC.Cmm.Sink (
     cmmSink
  ) where

import GHC.Prelude

import GHC.Cmm
import GHC.Cmm.Opt
import GHC.Cmm.Liveness
import GHC.Cmm.LRegSet
import GHC.Cmm.Utils
import GHC.Cmm.Dataflow.Block
import GHC.Cmm.Dataflow.Label
import GHC.Cmm.Dataflow.Collections
import GHC.Cmm.Dataflow.Graph
import GHC.Platform.Regs

import GHC.Platform
import GHC.Types.Unique.FM

import qualified Data.IntSet as IntSet
import Data.List (partition)
import Data.Maybe

import GHC.Exts (inline)

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
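--
-- For example, the intended result here is (an illustrative sketch,
-- assuming x1..x3 are dead in the L1 branch):
--
--    if (Sp - 32 < SpLim) then L1 else L2
--  L2:
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]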
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we don't
--    have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that is
--      not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--  * If N is an assignment to a local register, pick up the assignment
--    and add it to A.
--
--  * If N is not an assignment to a local register:
--    * remove any assignments from A that conflict with N, and
--      place them before N in the current block.  We call this
--      "dropping" the assignments.
--
--    * An assignment conflicts with N if it:
--      - assigns to a register mentioned in N
--      - mentions a register assigned by N
--      - reads from memory written by N
--    * do this recursively, dropping dependent assignments
--
--  * At an exit node:
--    * drop any assignments that are live on more than one successor
--      and are not trivial
--    * if any successor has more than one predecessor (a join-point),
--      drop everything live in that successor.  Since we only propagate
--      assignments that are not dead at the successor, we will therefore
--      eliminate all assignments dead at this point.  Thus analysis of a
--      join-point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.
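--
-- For example (illustrative): given
--
--    x = y + 1    -- only use of x is the next line
--    z = x        -- z is not live afterwards
--
-- the assignment to z is discarded outright as dead; x, which was picked
-- up into A, then occurs nowhere and is not live, so it is discarded as
-- well (see Note [discard during inlining] below).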

-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: Platform -> CmmGraph -> CmmGraph
cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLivenessL platform graph
  getLive l = mapFindWithDefault emptyLRegSet l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = IntSet.unions (map getLive succs)
      live_middle = gen_killL platform last live
      ann_middles = annotate platform live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk platform ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode platform last
      (final_last, assigs') = tryToInline platform live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = IntSet.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
        case filter (elemLRegSet r) live_sets of
          (_one:_two:_) -> True
          _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments platform drop_if init_live_sets assigs'

      drop_if :: (LocalReg, CmmExpr, AbsMem)
              -> [LRegSet] -> (Bool, [LRegSet])
      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts platform a final_last
                        || not (isTrivial platform rhs) && live_in_multi live_sets r
                        || r `elemLRegSet` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `elemLRegSet` set = set `IntSet.union` live_rhs
                    | otherwise           = set

            live_rhs = foldRegsUsed platform (flip insertLRegSet) emptyLRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments platform (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True --
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: Platform -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial platform (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch platform)
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe platform r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: Platform -> LRegSet -> [CmmNode O O] -> [(LRegSet, CmmNode O O)]
annotate platform live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_killL platform n live, (live,n) : nodes)
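
-- For example (an illustrative sketch): given the nodes
-- [ x = y + 1, z = x ] and live-after set {z}, annotate produces
--
--    [ ({x}, x = y + 1), ({z}, z = x) ]
--
-- i.e. only x is live after the first node, and only z after the second.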

--
-- Find the blocks that have multiple predecessors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: LabelMap Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: Platform -> LRegSet -> Assignments -> Assignments
filterAssignments platform live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `elemLRegSet` live
                       || any (conflicts platform a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in a:
--    * list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: Platform
     -> [(LRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                    -- the set of registers live *after*
                                    -- this node.

     -> Assignments                 -- The current list of
                                    -- assignments we are sinking.
                                    -- Earlier assignments may refer
                                    -- to later ones.

     -> ( Block CmmNode O O         -- The new block
        , Assignments               -- Assignments to sink further
        )

walk platform nodes assigs = go nodes emptyBlock assigs
  where
    go []               block as = (block, as)
    go ((live,node):ns) block as
      | shouldDiscard node live             = go ns block as
        -- discard dead assignment
      | Just a <- shouldSink platform node2 = go ns block (a : as1)
      | otherwise                           = go ns block' as'
      where
        node1 = constantFoldNode platform node

        (node2, as1) = tryToInline platform live node1 as

        (dropped, as') = dropAssignmentsSimple platform
                            (\a -> conflicts platform a node2) as1

        block' = foldl' blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: Platform -> CmmNode e x -> Maybe Assignment
shouldSink platform (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem platform e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `elemLRegSet` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: Platform -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple platform f = dropAssignments platform (\a _ -> (f a, ())) ()

dropAssignments :: Platform -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments platform should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts platform assig) dropped


-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: forall x. Platform
   -> LRegSet        -- set of registers live after this
   -- -> LocalRegSet -- set of registers live after this
                     -- node.  We cannot inline anything
                     -- that is live after the node, unless
                     -- it is small enough to duplicate.
   -> CmmNode O x    -- The node to inline into
   -> Assignments    -- Assignments to inline
   -> (
        CmmNode O x  -- New node
      , Assignments  -- Remaining assignments
      )

tryToInline platform liveAfter node assigs =
  -- pprTrace "tryToInline assig length:" (ppr $ length assigs) $
    go usages liveAfter node emptyLRegSet assigs
  where
    usages :: UniqFM LocalReg Int -- Maps each LocalReg to a count of how often it is used
    usages = foldLocalRegsUsed platform addUsage emptyUFM node

    go :: UniqFM LocalReg Int -> LRegSet -> CmmNode O x -> LRegSet -> Assignments
       -> (CmmNode O x, Assignments)
    go _usages _live node _skipped [] = (node, [])

    go usages live node skipped (a@(l,rhs,_) : rest)
      | cannot_inline          = dont_inline
      | occurs_none            = discard  -- Note [discard during inlining]
      | occurs_once            = inline_and_discard
      | isTrivial platform rhs = inline_and_keep
      | otherwise              = dont_inline
      where
        inline_and_discard = go usages' live inl_node skipped rest
          where usages' = foldLocalRegsUsed platform addUsage usages rhs

        discard = go usages live node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep :: CmmNode O x -> (CmmNode O x, Assignments)
        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages live' node' (insertLRegSet l skipped) rest

                -- Avoid discarding of assignments to vars on the rhs.
                -- See Note [Keeping assignments mentioned in skipped RHSs]
                -- usages' = foldLocalRegsUsed platform (\m r -> addToUFM m r 2)
                --                             usages rhs
                live' = inline foldLocalRegsUsed platform (\m r -> insertLRegSet r m)
                                                live rhs

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline platform rhs node)

        -- How often is l used in the current node.
        l_usages = lookupUFM usages l
        l_live   = l `elemLRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset platform rhs off
                    -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold platform op args
        inl_exp other = other

{- Note [Keeping assignments mentioned in skipped RHSs]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If we have two assignments, [z = y, y = e1], and we skip
z we *must* retain the assignment y = e1.  This is because
we might inline "z = y" into another node later on, so we
must ensure y is still defined at this point.

If we dropped the assignment of "y = e1" then we would end up
referencing a variable which hasn't been mentioned after
inlining.

We use a hack to do this.

We pretend the regs from the rhs are live after the current
node.  Since we only discard assignments to variables
which are dead after the current block, this prevents discarding of the
assignment.  It still allows inlining should e1 be a trivial rhs
however.

-}

{- Note [improveConditional]

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaNs).

But we *can* optimise this conditional by swapping the true and false
branches.  Given
  CmmCondBranch ((a >## b) != 1) t f
we can turn it into
  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply the above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps (see swapcond_last in
GHC.Cmm.ContFlowOpt), so instead we carefully look for just the cases
where we can eliminate a comparison.
-}
improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _)   = True
    neLike (MO_U_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
    neLike (MO_S_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
    neLike _ = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM LocalReg Int -> LocalReg -> UniqFM LocalReg Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = go ls e False
  where use :: LRegSet -> CmmExpr -> Bool -> Bool
        use ls (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        use ls (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        use _ls _ z = z

        go :: LRegSet -> CmmExpr -> Bool -> Bool
        go ls (CmmMachOp _ es) z = foldr (go ls) z es
        go ls (CmmLoad addr _) z = go ls addr z
        go ls e                z = use ls e z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers, really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing]
-- See also GHC.StgToCmm.Foreign.load_args_into_temps.
okToInline :: Platform -> CmmExpr -> CmmNode e x -> Bool
okToInline platform expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict platform expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: Platform -> Assignment -> CmmNode O x -> Bool
conflicts platform (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict platform rhs node = True
  | localRegistersConflict  platform rhs node = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed platform (\b r' -> r == r' || b) False node = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr platform addr' (cmmExprWidth platform e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True

  -- (6) suspendThread clobbers every global register not backed by a real
  -- register. It also clobbers heap and stack but this is handled by (5)
  | CmmUnsafeForeignCall (PrimTarget MO_SuspendThread) _ _ <- node
  , foldRegsUsed platform (\b g -> globalRegMaybe platform g == Nothing || b) False rhs
  = True

  -- (7) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem = True

  -- (8) otherwise, no conflict
  | otherwise = False

{- Note [Inlining foldRegsDefd]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

foldRegsDefd is, after optimization, *not* a small function so
it's only marked INLINEABLE, but not INLINE.

However in some specific cases we call it *very* often making it
important to avoid the overhead of allocating the folding function.

So we simply force inlining via the magic inline function.
For T3294 this improves allocation with -O by ~1%.

-}

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: Platform -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict platform expr node =
    -- See Note [Inlining foldRegsDefd]
    inline foldRegsDefd platform (\b r -> b || regUsedIn platform (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: Platform -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict platform expr node =
    -- See Note [Inlining foldRegsDefd]
    inline foldRegsDefd platform (\b r -> b || regUsedIn platform (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]), which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check any assignment conflicts with CmmCall. Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier (e.g. AtomicRMW), and
-- therefore we should never float any memory operations across one of
-- these calls.
--
-- `suspendThread` releases the capability used by the thread, hence we mustn't
-- float accesses to heap, stack or virtual global registers stored in the
-- capability (e.g. with unregisterised build, see #19237).


bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _         _       = True
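
-- Some illustrative consequences of the equations above (a sketch, not
-- part of the pass itself):
--
--   memConflicts (SpMem 8 8)  (SpMem 16 8) == False  -- disjoint Sp slots
--   memConflicts (SpMem 8 16) (SpMem 16 8) == True   -- overlapping Sp slots
--   memConflicts (SpMem 8 8)  HeapMem      == False  -- stack never aliases heap
--   memConflicts AnyMem       NoMem        == False  -- NoMem conflicts with nothing
--   memConflicts AnyMem       HeapMem      == True   -- conservative fall-through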

exprMem :: Platform -> CmmExpr -> AbsMem
exprMem platform (CmmLoad addr w)  = bothMems (loadAddr platform addr (typeWidth w)) (exprMem platform addr)
exprMem platform (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem platform) es)
exprMem _        _                 = NoMem

loadAddr :: Platform -> CmmExpr -> Width -> AbsMem
loadAddr platform e w =
  case e of
    CmmReg r      -> regAddr platform r 0 w
    CmmRegOff r i -> regAddr platform r i w
    _other | regUsedIn platform spReg e -> StackMem
           | otherwise                  -> AnyMem

regAddr :: Platform -> CmmReg -> Int -> Width -> AbsMem
regAddr _        (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _        (CmmGlobal Hp) _ _ = HeapMem
regAddr _        (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr platform r _ _ | isGcPtrType (cmmRegType platform r) = HeapMem -- yay! GCPtr pays for itself
regAddr _        _ _ _ = AnyMem

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inline F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints: []  result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints: []  result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                      10.0 :: W32));
-}