never executed always true always false
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18
19 {-# LANGUAGE BangPatterns #-}
20
21 module GHC.Core.Unfold (
22 Unfolding, UnfoldingGuidance, -- Abstract types
23
24 UnfoldingOpts (..), defaultUnfoldingOpts,
25 updateCreationThreshold, updateUseThreshold,
26 updateFunAppDiscount, updateDictDiscount,
27 updateVeryAggressive, updateCaseScaling,
28 updateCaseThreshold, updateReportPrefix,
29
30 ArgSummary(..),
31
32 couldBeSmallEnoughToInline, inlineBoringOk,
33 smallEnoughToInline,
34
35 callSiteInline, CallCtxt(..),
36 calcUnfoldingGuidance
37 ) where
38
39 import GHC.Prelude
40
41 import GHC.Driver.Flags
42
43 import GHC.Core
44 import GHC.Core.Utils
45 import GHC.Types.Id
46 import GHC.Core.DataCon
47 import GHC.Types.Literal
48 import GHC.Builtin.PrimOps
49 import GHC.Types.Id.Info
50 import GHC.Types.Basic ( Arity )
51 import GHC.Core.Type
52 import GHC.Builtin.Names
53 import GHC.Builtin.Types.Prim ( realWorldStatePrimTy )
54 import GHC.Data.Bag
55 import GHC.Utils.Logger
56 import GHC.Utils.Misc
57 import GHC.Utils.Outputable
58 import GHC.Types.ForeignCall
59 import GHC.Types.Name
60 import GHC.Types.Tickish
61
62 import qualified Data.ByteString as BS
63 import Data.List (isPrefixOf)
64
65
66 -- | Unfolding options
67 data UnfoldingOpts = UnfoldingOpts
68 { unfoldingCreationThreshold :: !Int
69 -- ^ Threshold above which unfoldings are not *created*
70
71 , unfoldingUseThreshold :: !Int
72 -- ^ Threshold above which unfoldings are not *inlined*
73
74 , unfoldingFunAppDiscount :: !Int
75 -- ^ Discount for lambdas that are used (applied)
76
77 , unfoldingDictDiscount :: !Int
78 -- ^ Discount for dictionaries
79
80 , unfoldingVeryAggressive :: !Bool
81 -- ^ Force inlining in many more cases
82
83 , unfoldingCaseThreshold :: !Int
84 -- ^ Don't consider depth up to x
85
86 , unfoldingCaseScaling :: !Int
87 -- ^ Penalize depth with 1/x
88
89 , unfoldingReportPrefix :: !(Maybe String)
90 -- ^ Only report inlining decisions for names with this prefix
91 }
92
93 defaultUnfoldingOpts :: UnfoldingOpts
94 defaultUnfoldingOpts = UnfoldingOpts
95 { unfoldingCreationThreshold = 750
96 -- The unfoldingCreationThreshold threshold must be reasonably high
97 -- to take account of possible discounts.
98 -- E.g. 450 is not enough in 'fulsom' for Interval.sqr to
99 -- inline into Csg.calc (The unfolding for sqr never makes it
100 -- into the interface file.)
101
102 , unfoldingUseThreshold = 90
103 -- Last adjusted upwards in #18282, when I reduced
104 -- the result discount for constructors.
105
106 , unfoldingFunAppDiscount = 60
107 -- Be fairly keen to inline a function if that means
108 -- we'll be able to pick the right method from a dictionary
109
110 , unfoldingDictDiscount = 30
111 -- Be fairly keen to inline a function if that means
112 -- we'll be able to pick the right method from a dictionary
113
114 , unfoldingVeryAggressive = False
115
116 -- Only apply scaling once we are deeper than threshold cases
117 -- in an RHS.
118 , unfoldingCaseThreshold = 2
119
120 -- Penalize depth with (size*depth)/scaling
121 , unfoldingCaseScaling = 30
122
123 -- Don't filter inlining decision reports
124 , unfoldingReportPrefix = Nothing
125 }
126
127 -- Helpers for "GHC.Driver.Session"
128
129 updateCreationThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
130 updateCreationThreshold n opts = opts { unfoldingCreationThreshold = n }
131
132 updateUseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
133 updateUseThreshold n opts = opts { unfoldingUseThreshold = n }
134
135 updateFunAppDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts
136 updateFunAppDiscount n opts = opts { unfoldingFunAppDiscount = n }
137
138 updateDictDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts
139 updateDictDiscount n opts = opts { unfoldingDictDiscount = n }
140
141 updateVeryAggressive :: Bool -> UnfoldingOpts -> UnfoldingOpts
142 updateVeryAggressive n opts = opts { unfoldingVeryAggressive = n }
143
144
145 updateCaseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
146 updateCaseThreshold n opts = opts { unfoldingCaseThreshold = n }
147
148 updateCaseScaling :: Int -> UnfoldingOpts -> UnfoldingOpts
149 updateCaseScaling n opts = opts { unfoldingCaseScaling = n }
150
151 updateReportPrefix :: Maybe String -> UnfoldingOpts -> UnfoldingOpts
152 updateReportPrefix n opts = opts { unfoldingReportPrefix = n }
153
154 {-
155 Note [Occurrence analysis of unfoldings]
156 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
157 We do occurrence-analysis of unfoldings once and for all, when the
158 unfolding is built, rather than each time we inline them.
159
160 But given this decision it's vital that we do
161 *always* do it. Consider this unfolding
162 \x -> letrec { f = ...g...; g* = f } in body
163 where g* is (for some strange reason) the loop breaker. If we don't
164 occ-anal it when reading it in, we won't mark g as a loop breaker, and
165 we may inline g entirely in body, dropping its binding, and leaving
166 the occurrence in f out of scope. This happened in #8892, where
167 the unfolding in question was a DFun unfolding.
168
169 But more generally, the simplifier is designed on the
170 basis that it is looking at occurrence-analysed expressions, so better
171 ensure that they actually are.
172
173 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
174 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
175 Notice that we give the non-occur-analysed expression to
176 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
177 first; for example, sometimes during simplification, there's a large
178 let-bound thing which has been substituted, and so is now dead; so
179 'expr' contains two copies of the thing while the occurrence-analysed
180 expression doesn't.
181
182 Nevertheless, we *don't* and *must not* occ-analyse before computing
183 the size because
184
185 a) The size computation bales out after a while, whereas occurrence
186 analysis does not.
187
188 b) Residency increases sharply if you occ-anal first. I'm not
189 100% sure why, but it's a large effect. Compiling Cabal went
190 from residency of 534M to over 800M with this one change.
191
192 This can occasionally mean that the guidance is very pessimistic;
193 it gets fixed up next round. And it should be rare, because large
194 let-bound things that are dead are usually caught by preInlineUnconditionally
195
196
197 ************************************************************************
198 * *
199 \subsection{The UnfoldingGuidance type}
200 * *
201 ************************************************************************
202 -}
203
204 inlineBoringOk :: CoreExpr -> Bool
205 -- See Note [INLINE for small functions]
206 -- True => the result of inlining the expression is
207 -- no bigger than the expression itself
208 -- eg (\x y -> f y x)
209 -- This is a quick and dirty version. It doesn't attempt
210 -- to deal with (\x y z -> x (y z))
211 -- The really important one is (x `cast` c)
212 inlineBoringOk e
213 = go 0 e
214 where
215 go :: Int -> CoreExpr -> Bool
216 go credit (Lam x e) | isId x = go (credit+1) e
217 | otherwise = go credit e
218 -- See Note [Count coercion arguments in boring contexts]
219 go credit (App f (Type {})) = go credit f
220 go credit (App f a) | credit > 0
221 , exprIsTrivial a = go (credit-1) f
222 go credit (Tick _ e) = go credit e -- dubious
223 go credit (Cast e _) = go credit e
224 go credit (Case scrut _ _ [Alt _ _ rhs]) -- See Note [Inline unsafeCoerce]
225 | isUnsafeEqualityProof scrut = go credit rhs
226 go _ (Var {}) = boringCxtOk
227 go _ _ = boringCxtNotOk
228
229 calcUnfoldingGuidance
230 :: UnfoldingOpts
231 -> Bool -- Definitely a top-level, bottoming binding
232 -> CoreExpr -- Expression to look at
233 -> UnfoldingGuidance
234 calcUnfoldingGuidance opts is_top_bottoming (Tick t expr)
235 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
236 = calcUnfoldingGuidance opts is_top_bottoming expr
237 calcUnfoldingGuidance opts is_top_bottoming expr
238 = case sizeExpr opts bOMB_OUT_SIZE val_bndrs body of
239 TooBig -> UnfNever
240 SizeIs size cased_bndrs scrut_discount
241 | uncondInline expr n_val_bndrs size
242 -> UnfWhen { ug_unsat_ok = unSaturatedOk
243 , ug_boring_ok = boringCxtOk
244 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
245
246 | is_top_bottoming
247 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
248
249 | otherwise
250 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
251 , ug_size = size
252 , ug_res = scrut_discount }
253
254 where
255 (bndrs, body) = collectBinders expr
256 bOMB_OUT_SIZE = unfoldingCreationThreshold opts
257 -- Bomb out if size gets bigger than this
258 val_bndrs = filter isId bndrs
259 n_val_bndrs = length val_bndrs
260
261 mk_discount :: Bag (Id,Int) -> Id -> Int
262 mk_discount cbs bndr = foldl' combine 0 cbs
263 where
264 combine acc (bndr', disc)
265 | bndr == bndr' = acc `plus_disc` disc
266 | otherwise = acc
267
268 plus_disc :: Int -> Int -> Int
269 plus_disc | isFunTy (idType bndr) = max
270 | otherwise = (+)
271 -- See Note [Function and non-function discounts]
272
273 {- Note [Inline unsafeCoerce]
274 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
275 We really want to inline unsafeCoerce, even when applied to boring
276 arguments. It doesn't look as if its RHS is smaller than the call
277 unsafeCoerce x = case unsafeEqualityProof @a @b of UnsafeRefl -> x
278 but that case is discarded -- see Note [Implementing unsafeCoerce]
279 in base:Unsafe.Coerce.
280
281 Moreover, if we /don't/ inline it, we may be left with
282 f (unsafeCoerce x)
283 which will build a thunk -- bad, bad, bad.
284
285 Conclusion: we really want inlineBoringOk to be True of the RHS of
286 unsafeCoerce. This is (U4) in Note [Implementing unsafeCoerce].
287
288 Note [Computing the size of an expression]
289 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
290 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
291 heuristics right has taken a long time. Here's the basic strategy:
292
293 * Variables, literals: 0
294 (Exception for string literals, see litSize.)
295
296 * Function applications (f e1 .. en): 1 + #value args
297
298 * Constructor applications: 1, regardless of #args
299
300 * Let(rec): 1 + size of components
301
302 * Note, cast: 0
303
304 Examples
305
306 Size Term
307 --------------
308 0 42#
309 0 x
310 0 True
311 2 f x
312 1 Just x
313 4 f (g x)
314
315 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
316 a function call to account for. Notice also that constructor applications
317 are very cheap, because exposing them to a caller is so valuable.
318
319 [25/5/11] All sizes are now multiplied by 10, except for primops
320 (which have sizes like 1 or 4. This makes primops look fantastically
321 cheap, and seems to be almost universally beneficial. Done partly as a
322 result of #4978.
323
324 Note [Do not inline top-level bottoming functions]
325 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
326 The FloatOut pass has gone to some trouble to float out calls to 'error'
327 and similar friends. See Note [Bottoming floats] in GHC.Core.Opt.SetLevels.
328 Do not re-inline them! But we *do* still inline if they are very small
329 (the uncondInline stuff).
330
331 Note [INLINE for small functions]
332 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
333 Consider {-# INLINE f #-}
334 f x = Just x
335 g y = f y
336 Then f's RHS is no larger than its LHS, so we should inline it into
337 even the most boring context. In general, f the function is
338 sufficiently small that its body is as small as the call itself, the
339 inline unconditionally, regardless of how boring the context is.
340
341 Things to note:
342
343 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
344 than the thing it's replacing. Notice that
345 (f x) --> (g 3) -- YES, unconditionally
346 (f x) --> x : [] -- YES, *even though* there are two
347 -- arguments to the cons
348 x --> g 3 -- NO
349 x --> Just v -- NO
350
351 It's very important not to unconditionally replace a variable by
352 a non-atomic term.
353
354 (2) We do this even if the thing isn't saturated, else we end up with the
355 silly situation that
356 f x y = x
357 ...map (f 3)...
358 doesn't inline. Even in a boring context, inlining without being
359 saturated will give a lambda instead of a PAP, and will be more
360 efficient at runtime.
361
362 (3) However, when the function's arity > 0, we do insist that it
363 has at least one value argument at the call site. (This check is
364 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
365 f = /\a \x:a. x
366 d = /\b. MkD (f b)
367 If we inline f here we get
368 d = /\b. MkD (\x:b. x)
369 and then prepareRhs floats out the argument, abstracting the type
370 variables, so we end up with the original again!
371
372 (4) We must be much more cautious about arity-zero things. Consider
373 let x = y +# z in ...
374 In *size* terms primops look very small, because the generate a
375 single instruction, but we do not want to unconditionally replace
376 every occurrence of x with (y +# z). So we only do the
377 unconditional-inline thing for *trivial* expressions.
378
379 NB: you might think that PostInlineUnconditionally would do this
380 but it doesn't fire for top-level things; see GHC.Core.Opt.Simplify.Utils
381 Note [Top level and postInlineUnconditionally]
382
383 Note [Count coercion arguments in boring contexts]
384 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
385 In inlineBoringOK, we ignore type arguments when deciding whether an
386 expression is okay to inline into boring contexts. This is good, since
387 if we have a definition like
388
389 let y = x @Int in f y y
390
391 there’s no reason not to inline y at both use sites — no work is
392 actually duplicated. It may seem like the same reasoning applies to
393 coercion arguments, and indeed, in #17182 we changed inlineBoringOK to
394 treat coercions the same way.
395
396 However, this isn’t a good idea: unlike type arguments, which have
397 no runtime representation, coercion arguments *do* have a runtime
398 representation (albeit the zero-width VoidRep, see Note [Coercion tokens]
399 in "GHC.CoreToStg"). This caused trouble in #17787 for DataCon wrappers for
400 nullary GADT constructors: the wrappers would be inlined and each use of
401 the constructor would lead to a separate allocation instead of just
402 sharing the wrapper closure.
403
404 The solution: don’t ignore coercion arguments after all.
405 -}
406
407 uncondInline :: CoreExpr -> Arity -> Int -> Bool
408 -- Inline unconditionally if there no size increase
409 -- Size of call is arity (+1 for the function)
410 -- See Note [INLINE for small functions]
411 uncondInline rhs arity size
412 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
413 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
414
415 sizeExpr :: UnfoldingOpts
416 -> Int -- Bomb out if it gets bigger than this
417 -> [Id] -- Arguments; we're interested in which of these
418 -- get case'd
419 -> CoreExpr
420 -> ExprSize
421
422 -- Note [Computing the size of an expression]
423
424 -- Forcing bOMB_OUT_SIZE early prevents repeated
425 -- unboxing of the Int argument.
426 sizeExpr opts !bOMB_OUT_SIZE top_args expr
427 = size_up expr
428 where
429 size_up (Cast e _) = size_up e
430 size_up (Tick _ e) = size_up e
431 size_up (Type _) = sizeZero -- Types cost nothing
432 size_up (Coercion _) = sizeZero
433 size_up (Lit lit) = sizeN (litSize lit)
434 size_up (Var f) | isRealWorldId f = sizeZero
435 -- Make sure we get constructor discounts even
436 -- on nullary constructors
437 | otherwise = size_up_call f [] 0
438
439 size_up (App fun arg)
440 | isTyCoArg arg = size_up fun
441 | otherwise = size_up arg `addSizeNSD`
442 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
443
444 size_up (Lam b e)
445 | isId b && not (isRealWorldId b) = lamScrutDiscount opts (size_up e `addSizeN` 10)
446 | otherwise = size_up e
447
448 size_up (Let (NonRec binder rhs) body)
449 = size_up_rhs (binder, rhs) `addSizeNSD`
450 size_up body `addSizeN`
451 size_up_alloc binder
452
453 size_up (Let (Rec pairs) body)
454 = foldr (addSizeNSD . size_up_rhs)
455 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
456 pairs
457
458 size_up (Case e _ _ alts)
459 | null alts
460 = size_up e -- case e of {} never returns, so take size of scrutinee
461
462 size_up (Case e _ _ alts)
463 -- Now alts is non-empty
464 | Just v <- is_top_arg e -- We are scrutinising an argument variable
465 = let
466 alt_sizes = map size_up_alt alts
467
468 -- alts_size tries to compute a good discount for
469 -- the case when we are scrutinising an argument variable
470 alts_size (SizeIs tot tot_disc tot_scrut)
471 -- Size of all alternatives
472 (SizeIs max _ _)
473 -- Size of biggest alternative
474 = SizeIs tot (unitBag (v, 20 + tot - max)
475 `unionBags` tot_disc) tot_scrut
476 -- If the variable is known, we produce a
477 -- discount that will take us back to 'max',
478 -- the size of the largest alternative The
479 -- 1+ is a little discount for reduced
480 -- allocation in the caller
481 --
482 -- Notice though, that we return tot_disc,
483 -- the total discount from all branches. I
484 -- think that's right.
485
486 alts_size tot_size _ = tot_size
487 in
488 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
489 (foldr1 maxSize alt_sizes)
490 -- Good to inline if an arg is scrutinised, because
491 -- that may eliminate allocation in the caller
492 -- And it eliminates the case itself
493 where
494 is_top_arg (Var v) | v `elem` top_args = Just v
495 is_top_arg (Cast e _) = is_top_arg e
496 is_top_arg _ = Nothing
497
498
499 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
500 foldr (addAltSize . size_up_alt) case_size alts
501 where
502 case_size
503 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
504 | otherwise = sizeZero
505 -- Normally we don't charge for the case itself, but
506 -- we charge one per alternative (see size_up_alt,
507 -- below) to account for the cost of the info table
508 -- and comparisons.
509 --
510 -- However, in certain cases (see is_inline_scrut
511 -- below), no code is generated for the case unless
512 -- there are multiple alts. In these cases we
513 -- subtract one, making the first alt free.
514 -- e.g. case x# +# y# of _ -> ... should cost 1
515 -- case touch# x# of _ -> ... should cost 0
516 -- (see #4978)
517 --
518 -- I would like to not have the "lengthAtMost alts 1"
519 -- condition above, but without that some programs got worse
520 -- (spectral/hartel/event and spectral/para). I don't fully
521 -- understand why. (SDM 24/5/11)
522
523 -- unboxed variables, inline primops and unsafe foreign calls
524 -- are all "inline" things:
525 is_inline_scrut (Var v) = isUnliftedType (idType v)
526 is_inline_scrut scrut
527 | (Var f, _) <- collectArgs scrut
528 = case idDetails f of
529 FCallId fc -> not (isSafeForeignCall fc)
530 PrimOpId op -> not (primOpOutOfLine op)
531 _other -> False
532 | otherwise
533 = False
534
535 size_up_rhs (bndr, rhs)
536 | Just join_arity <- isJoinId_maybe bndr
537 -- Skip arguments to join point
538 , (_bndrs, body) <- collectNBinders join_arity rhs
539 = size_up body
540 | otherwise
541 = size_up rhs
542
543 ------------
544 -- size_up_app is used when there's ONE OR MORE value args
545 size_up_app (App fun arg) args voids
546 | isTyCoArg arg = size_up_app fun args voids
547 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
548 | otherwise = size_up arg `addSizeNSD`
549 size_up_app fun (arg:args) voids
550 size_up_app (Var fun) args voids = size_up_call fun args voids
551 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
552 size_up_app (Cast expr _) args voids = size_up_app expr args voids
553 size_up_app other args voids = size_up other `addSizeN`
554 callSize (length args) voids
555 -- if the lhs is not an App or a Var, or an invisible thing like a
556 -- Tick or Cast, then we should charge for a complete call plus the
557 -- size of the lhs itself.
558
559 ------------
560 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
561 size_up_call fun val_args voids
562 = case idDetails fun of
563 FCallId _ -> sizeN (callSize (length val_args) voids)
564 DataConWorkId dc -> conSize dc (length val_args)
565 PrimOpId op -> primOpSize op (length val_args)
566 ClassOpId _ -> classOpSize opts top_args val_args
567 _ -> funSize opts top_args fun (length val_args) voids
568
569 ------------
570 size_up_alt (Alt _con _bndrs rhs) = size_up rhs `addSizeN` 10
571 -- Don't charge for args, so that wrappers look cheap
572 -- (See comments about wrappers with Case)
573 --
574 -- IMPORTANT: *do* charge 1 for the alternative, else we
575 -- find that giant case nests are treated as practically free
576 -- A good example is Foreign.C.Error.errnoToIOError
577
578 ------------
579 -- Cost to allocate binding with given binder
580 size_up_alloc bndr
581 | isTyVar bndr -- Doesn't exist at runtime
582 || isJoinId bndr -- Not allocated at all
583 || isUnliftedType (idType bndr) -- Doesn't live in heap
584 = 0
585 | otherwise
586 = 10
587
588 ------------
589 -- These addSize things have to be here because
590 -- I don't want to give them bOMB_OUT_SIZE as an argument
591 addSizeN TooBig _ = TooBig
592 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
593
594 -- addAltSize is used to add the sizes of case alternatives
595 addAltSize TooBig _ = TooBig
596 addAltSize _ TooBig = TooBig
597 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
598 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
599 (xs `unionBags` ys)
600 (d1 + d2) -- Note [addAltSize result discounts]
601
602 -- This variant ignores the result discount from its LEFT argument
603 -- It's used when the second argument isn't part of the result
604 addSizeNSD TooBig _ = TooBig
605 addSizeNSD _ TooBig = TooBig
606 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
607 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
608 (xs `unionBags` ys)
609 d2 -- Ignore d1
610
611 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
612
613 -- an expression of type State# RealWorld must be a variable
614 isRealWorldExpr (Var id) = isRealWorldId id
615 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
616 isRealWorldExpr _ = False
617
618 -- | Finds a nominal size of a string literal.
619 litSize :: Literal -> Int
620 -- Used by GHC.Core.Unfold.sizeExpr
621 litSize (LitNumber LitNumBigNat _) = 100
622 litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
623 -- If size could be 0 then @f "x"@ might be too small
624 -- [Sept03: make literal strings a bit bigger to avoid fruitless
625 -- duplication of little strings]
626 litSize _other = 0 -- Must match size of nullary constructors
627 -- Key point: if x |-> 4, then x must inline unconditionally
628 -- (eg via case binding)
629
630 classOpSize :: UnfoldingOpts -> [Id] -> [CoreExpr] -> ExprSize
631 -- See Note [Conlike is interesting]
632 classOpSize _ _ []
633 = sizeZero
634 classOpSize opts top_args (arg1 : other_args)
635 = SizeIs size arg_discount 0
636 where
637 size = 20 + (10 * length other_args)
638 -- If the class op is scrutinising a lambda bound dictionary then
639 -- give it a discount, to encourage the inlining of this function
640 -- The actual discount is rather arbitrarily chosen
641 arg_discount = case arg1 of
642 Var dict | dict `elem` top_args
643 -> unitBag (dict, unfoldingDictDiscount opts)
644 _other -> emptyBag
645
646 -- | The size of a function call
647 callSize
648 :: Int -- ^ number of value args
649 -> Int -- ^ number of value args that are void
650 -> Int
651 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
652 -- The 1+ is for the function itself
653 -- Add 1 for each non-trivial arg;
654 -- the allocation cost, as in let(rec)
655
656 -- | The size of a jump to a join point
657 jumpSize
658 :: Int -- ^ number of value args
659 -> Int -- ^ number of value args that are void
660 -> Int
661 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
662 -- A jump is 20% the size of a function call. Making jumps free reopens
663 -- bug #6048, but making them any more expensive loses a 21% improvement in
664 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
665 -- better solution?
666
667 funSize :: UnfoldingOpts -> [Id] -> Id -> Int -> Int -> ExprSize
668 -- Size for functions that are not constructors or primops
669 -- Note [Function applications]
670 funSize opts top_args fun n_val_args voids
671 | fun `hasKey` buildIdKey = buildSize
672 | fun `hasKey` augmentIdKey = augmentSize
673 | otherwise = SizeIs size arg_discount res_discount
674 where
675 some_val_args = n_val_args > 0
676 is_join = isJoinId fun
677
678 size | is_join = jumpSize n_val_args voids
679 | not some_val_args = 0
680 | otherwise = callSize n_val_args voids
681
682 -- DISCOUNTS
683 -- See Note [Function and non-function discounts]
684 arg_discount | some_val_args && fun `elem` top_args
685 = unitBag (fun, unfoldingFunAppDiscount opts)
686 | otherwise = emptyBag
687 -- If the function is an argument and is applied
688 -- to some values, give it an arg-discount
689
690 res_discount | idArity fun > n_val_args = unfoldingFunAppDiscount opts
691 | otherwise = 0
692 -- If the function is partially applied, show a result discount
693 -- XXX maybe behave like ConSize for eval'd variable
694
695 conSize :: DataCon -> Int -> ExprSize
696 conSize dc n_val_args
697 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
698
699 -- See Note [Unboxed tuple size and result discount]
700 | isUnboxedTupleDataCon dc = SizeIs 0 emptyBag 10
701
702 -- See Note [Constructor size and result discount]
703 | otherwise = SizeIs 10 emptyBag 10
704
705 {- Note [Constructor size and result discount]
706 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
707 Treat a constructors application as size 10, regardless of how many
708 arguments it has; we are keen to expose them (and we charge separately
709 for their args). We can't treat them as size zero, else we find that
710 (Just x) has size 0, which is the same as a lone variable; and hence
711 'v' will always be replaced by (Just x), where v is bound to Just x.
712
713 The "result discount" is applied if the result of the call is
714 scrutinised (say by a case). For a constructor application that will
715 mean the constructor application will disappear, so we don't need to
716 charge it to the function. So the discount should at least match the
717 cost of the constructor application, namely 10.
718
719 Historical note 1: Until Jun 2020 we gave it a "bit of extra
720 incentive" via a discount of 10*(1 + n_val_args), but that was FAR too
721 much (#18282). In particular, consider a huge case tree like
722
723 let r = case y1 of
724 Nothing -> B1 a b c
725 Just v1 -> case y2 of
726 Nothing -> B1 c b a
727 Just v2 -> ...
728
729 If conSize gives a cost of 10 (regardless of n_val_args) and a
730 discount of 10, that'll make each alternative RHS cost zero. We
731 charge 10 for each case alternative (see size_up_alt). If we give a
732 bigger discount (say 20) in conSize, we'll make the case expression
733 cost *nothing*, and that can make a huge case tree cost nothing. This
734 leads to massive, sometimes exponenial inlinings (#18282). In short,
735 don't give a discount that give a negative size to a sub-expression!
736
737 Historical note 2: Much longer ago, Simon M tried a MUCH bigger
738 discount: (10 * (10 + n_val_args)), and said it was an "unambiguous
739 win", but its terribly dangerous because a function with many many
740 case branches, each finishing with a constructor, can have an
741 arbitrarily large discount. This led to terrible code bloat: see #6099.
742
743 Note [Unboxed tuple size and result discount]
744 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
745 However, unboxed tuples count as size zero. I found occasions where we had
746 f x y z = case op# x y z of { s -> (# s, () #) }
747 and f wasn't getting inlined.
748
749 I tried giving unboxed tuples a *result discount* of zero (see the
750 commented-out line). Why? When returned as a result they do not
751 allocate, so maybe we don't want to charge so much for them. If you
752 have a non-zero discount here, we find that workers often get inlined
753 back into wrappers, because it look like
754 f x = case $wf x of (# a,b #) -> (a,b)
755 and we are keener because of the case. However while this change
756 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
757 more. All other changes were very small. So it's not a big deal but I
758 didn't adopt the idea.
759
760 When fixing #18282 (see Note [Constructor size and result discount])
761 I changed the result discount to be just 10, not 10*(1+n_val_args).
762
763 Note [Function and non-function discounts]
764 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
765 We want a discount if the function is applied. A good example is
766 monadic combinators with continuation arguments, where inlining is
767 quite important.
768
769 But we don't want a big discount when a function is called many times
770 (see the detailed comments with #6048) because if the function is
771 big it won't be inlined at its many call sites and no benefit results.
772 Indeed, we can get exponentially big inlinings this way; that is what
773 #6048 is about.
774
775 On the other hand, for data-valued arguments, if there are lots of
776 case expressions in the body, each one will get smaller if we apply
777 the function to a constructor application, so we *want* a big discount
778 if the argument is scrutinised by many case expressions.
779
780 Conclusion:
781 - For functions, take the max of the discounts
782 - For data values, take the sum of the discounts
783
784
785 Note [Literal integer size]
786 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
787 Literal integers *can* be big (mkInteger [...coefficients...]), but
788 need not be (IS n). We just use an arbitrary big-ish constant here
789 so that, in particular, we don't inline top-level defns like
790 n = IS 5
791 There's no point in doing so -- any optimisations will see the IS
792 through n's unfolding. Nor will a big size inhibit unfoldings functions
793 that mention a literal Integer, because the float-out pass will float
794 all those constants to top level.
795 -}
796
797 primOpSize :: PrimOp -> Int -> ExprSize
798 primOpSize op n_val_args
799 = if primOpOutOfLine op
800 then sizeN (op_size + n_val_args)
801 else sizeN op_size
802 where
803 op_size = primOpCodeSize op
804
805
806 buildSize :: ExprSize
807 buildSize = SizeIs 0 emptyBag 40
808 -- We really want to inline applications of build
809 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
810 -- Indeed, we should add a result_discount because build is
811 -- very like a constructor. We don't bother to check that the
812 -- build is saturated (it usually is). The "-2" discounts for the \c n,
813 -- The "4" is rather arbitrary.
814
815 augmentSize :: ExprSize
816 augmentSize = SizeIs 0 emptyBag 40
817 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
818 -- e plus ys. The -2 accounts for the \cn
819
820 -- When we return a lambda, give a discount if it's used (applied)
821 lamScrutDiscount :: UnfoldingOpts -> ExprSize -> ExprSize
822 lamScrutDiscount opts (SizeIs n vs _) = SizeIs n vs (unfoldingFunAppDiscount opts)
823 lamScrutDiscount _ TooBig = TooBig
824
825 {-
826 Note [addAltSize result discounts]
827 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
828 When adding the size of alternatives, we *add* the result discounts
829 too, rather than take the *maximum*. For a multi-branch case, this
830 gives a discount for each branch that returns a constructor, making us
831 keener to inline. I did try using 'max' instead, but it makes nofib
832 'rewrite' and 'puzzle' allocate significantly more, and didn't make
833 binary sizes shrink significantly either.
834
835 Note [Discounts and thresholds]
836 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
837
838 Constants for discounts and thresholds are defined in 'UnfoldingOpts'. They are:
839
840 unfoldingCreationThreshold
841 At a definition site, if the unfolding is bigger than this, we
842 may discard it altogether
843
844 unfoldingUseThreshold
845 At a call site, if the unfolding, less discounts, is smaller than
846 this, then it's small enough inline
847
848 unfoldingDictDiscount
849 The discount for each occurrence of a dictionary argument
850 as an argument of a class method. Should be pretty small
851 else big functions may get inlined
852
853 unfoldingFunAppDiscount
854 Discount for a function argument that is applied. Quite
855 large, because if we inline we avoid the higher-order call.
856
857 unfoldingVeryAggressive
858 If True, the compiler ignores all the thresholds and inlines very
859 aggressively. It still adheres to arity, simplifier phase control and
860 loop breakers.
861
862
863 Historical Note: Before April 2020 we had another factor,
864 ufKeenessFactor, which would scale the discounts before they were subtracted
865 from the size. This was justified with the following comment:
866
867 -- We multiply the raw discounts (args_discount and result_discount)
868 -- ty opt_UnfoldingKeenessFactor because the former have to do with
869 -- *size* whereas the discounts imply that there's some extra
870 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
871 -- by inlining.
872
873 However, this is highly suspect since it means that we subtract a *scaled* size
874 from an absolute size, resulting in crazy (e.g. negative) scores in some cases
875 (#15304). We consequently killed off ufKeenessFactor and bumped up the
876 ufUseThreshold to compensate.
877
878
879 Note [Function applications]
880 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
881 In a function application (f a b)
882
883 - If 'f' is an argument to the function being analysed,
884 and there's at least one value arg, record a FunAppDiscount for f
885
886 - If the application if a PAP (arity > 2 in this example)
887 record a *result* discount (because inlining
888 with "extra" args in the call may mean that we now
889 get a saturated application)
890
891 Code for manipulating sizes
892 -}
893
894 -- | The size of a candidate expression for unfolding
895 data ExprSize
896 = TooBig
897 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
898 , _es_args :: !(Bag (Id,Int))
899 -- ^ Arguments cased herein, and discount for each such
900 , _es_discount :: {-# UNPACK #-} !Int
901 -- ^ Size to subtract if result is scrutinised by a case
902 -- expression
903 }
904
905 instance Outputable ExprSize where
906 ppr TooBig = text "TooBig"
907 ppr (SizeIs a _ c) = brackets (int a <+> int c)
908
909 -- subtract the discount before deciding whether to bale out. eg. we
910 -- want to inline a large constructor application into a selector:
911 -- tup = (a_1, ..., a_99)
912 -- x = case tup of ...
913 --
914 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
915 mkSizeIs max n xs d | (n - d) > max = TooBig
916 | otherwise = SizeIs n xs d
917
918 maxSize :: ExprSize -> ExprSize -> ExprSize
919 maxSize TooBig _ = TooBig
920 maxSize _ TooBig = TooBig
921 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
922 | otherwise = s2
923
924 sizeZero :: ExprSize
925 sizeN :: Int -> ExprSize
926
927 sizeZero = SizeIs 0 emptyBag 0
928 sizeN n = SizeIs n emptyBag 0
929
930 {-
931 ************************************************************************
932 * *
933 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
934 * *
935 ************************************************************************
936
937 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
938 we ``couldn't possibly use'' on the other side. Can be overridden w/
939 flaggery. Just the same as smallEnoughToInline, except that it has no
940 actual arguments.
941 -}
942
943 couldBeSmallEnoughToInline :: UnfoldingOpts -> Int -> CoreExpr -> Bool
944 couldBeSmallEnoughToInline opts threshold rhs
945 = case sizeExpr opts threshold [] body of
946 TooBig -> False
947 _ -> True
948 where
949 (_, body) = collectBinders rhs
950
951 ----------------
952 smallEnoughToInline :: UnfoldingOpts -> Unfolding -> Bool
953 smallEnoughToInline opts (CoreUnfolding {uf_guidance = guidance})
954 = case guidance of
955 UnfIfGoodArgs {ug_size = size} -> size <= unfoldingUseThreshold opts
956 UnfWhen {} -> True
957 UnfNever -> False
958 smallEnoughToInline _ _
959 = False
960
961 {-
962 ************************************************************************
963 * *
964 \subsection{callSiteInline}
965 * *
966 ************************************************************************
967
968 This is the key function. It decides whether to inline a variable at a call site
969
970 callSiteInline is used at call sites, so it is a bit more generous.
971 It's a very important function that embodies lots of heuristics.
972 A non-WHNF can be inlined if it doesn't occur inside a lambda,
973 and occurs exactly once or
974 occurs once in each branch of a case and is small
975
976 If the thing is in WHNF, there's no danger of duplicating work,
977 so we can inline if it occurs once, or is small
978
979 NOTE: we don't want to inline top-level functions that always diverge.
980 It just makes the code bigger. Tt turns out that the convenient way to prevent
981 them inlining is to give them a NOINLINE pragma, which we do in
982 StrictAnal.addStrictnessInfoToTopId
983 -}
984
985 data ArgSummary = TrivArg -- Nothing interesting
986 | NonTrivArg -- Arg has structure
987 | ValueArg -- Arg is a con-app or PAP
988 -- ..or con-like. Note [Conlike is interesting]
989
990 instance Outputable ArgSummary where
991 ppr TrivArg = text "TrivArg"
992 ppr NonTrivArg = text "NonTrivArg"
993 ppr ValueArg = text "ValueArg"
994
995 nonTriv :: ArgSummary -> Bool
996 nonTriv TrivArg = False
997 nonTriv _ = True
998
999 data CallCtxt
1000 = BoringCtxt
1001 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1002 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1003 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1004
1005 | ValAppCtxt -- We're applied to at least one value arg
1006 -- This arises when we have ((f x |> co) y)
1007 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1008
1009 | CaseCtxt -- We're the scrutinee of a case
1010 -- that decomposes its scrutinee
1011
1012 instance Outputable CallCtxt where
1013 ppr CaseCtxt = text "CaseCtxt"
1014 ppr ValAppCtxt = text "ValAppCtxt"
1015 ppr BoringCtxt = text "BoringCtxt"
1016 ppr RhsCtxt = text "RhsCtxt"
1017 ppr DiscArgCtxt = text "DiscArgCtxt"
1018 ppr RuleArgCtxt = text "RuleArgCtxt"
1019
1020 callSiteInline :: Logger
1021 -> UnfoldingOpts
1022 -> Int -- Case depth
1023 -> Id -- The Id
1024 -> Bool -- True <=> unfolding is active
1025 -> Bool -- True if there are no arguments at all (incl type args)
1026 -> [ArgSummary] -- One for each value arg; True if it is interesting
1027 -> CallCtxt -- True <=> continuation is interesting
1028 -> Maybe CoreExpr -- Unfolding, if any
1029 callSiteInline logger opts !case_depth id active_unfolding lone_variable arg_infos cont_info
1030 = case idUnfolding id of
1031 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1032 -- Things with an INLINE pragma may have an unfolding *and*
1033 -- be a loop breaker (maybe the knot is not yet untied)
1034 CoreUnfolding { uf_tmpl = unf_template
1035 , uf_is_work_free = is_wf
1036 , uf_guidance = guidance, uf_expandable = is_exp }
1037 | active_unfolding -> tryUnfolding logger opts case_depth id lone_variable
1038 arg_infos cont_info unf_template
1039 is_wf is_exp guidance
1040 | otherwise -> traceInline logger opts id "Inactive unfolding:" (ppr id) Nothing
1041 NoUnfolding -> Nothing
1042 BootUnfolding -> Nothing
1043 OtherCon {} -> Nothing
1044 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1045
1046 -- | Report the inlining of an identifier's RHS to the user, if requested.
1047 traceInline :: Logger -> UnfoldingOpts -> Id -> String -> SDoc -> a -> a
1048 traceInline logger opts inline_id str doc result
1049 -- We take care to ensure that doc is used in only one branch, ensuring that
1050 -- the simplifier can push its allocation into the branch. See Note [INLINE
1051 -- conditional tracing utilities].
1052 | enable = logTraceMsg logger str doc result
1053 | otherwise = result
1054 where
1055 enable
1056 | logHasDumpFlag logger Opt_D_dump_verbose_inlinings
1057 = True
1058 | Just prefix <- unfoldingReportPrefix opts
1059 = prefix `isPrefixOf` occNameString (getOccName inline_id)
1060 | otherwise
1061 = False
1062 {-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
1063
1064 {- Note [Avoid inlining into deeply nested cases]
1065 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1066
1067 Consider a function f like this:
1068
1069 f arg1 arg2 =
1070 case ...
1071 ... -> g arg1
1072 ... -> g arg2
1073
1074 This function is small. So should be safe to inline.
1075 However sometimes this doesn't quite work out like that.
1076 Consider this code:
1077
1078 f1 arg1 arg2 ... = ...
1079 case _foo of
1080 alt1 -> ... f2 arg1 ...
1081 alt2 -> ... f2 arg2 ...
1082
1083 f2 arg1 arg2 ... = ...
1084 case _foo of
1085 alt1 -> ... f3 arg1 ...
1086 alt2 -> ... f3 arg2 ...
1087
1088 f3 arg1 arg2 ... = ...
1089
1090 ... repeats up to n times. And then f1 is
1091 applied to some arguments:
1092
1093 foo = ... f1 <interestingArgs> ...
1094
1095 Initially f2..fn are not interesting to inline so we don't.
1096 However we see that f1 is applied to interesting args.
1097 So it's an obvious choice to inline those:
1098
1099 foo =
1100 ...
1101 case _foo of
1102 alt1 -> ... f2 <interestingArg> ...
1103 alt2 -> ... f2 <interestingArg> ...
1104
1105 As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting
1106 arguments and f2 is small:
1107
1108 foo =
1109 ...
1110 case _foo of
1111 alt1 -> ... case _foo of
1112 alt1 -> ... f3 <interestingArg> ...
1113 alt2 -> ... f3 <interestingArg> ...
1114
1115 alt2 -> ... case _foo of
1116 alt1 -> ... f3 <interestingArg> ...
1117 alt2 -> ... f3 <interestingArg> ...
1118
1119 The same thing happens for each binding up to f_n, duplicating the amount of inlining
1120 done in each step. Until at some point we are either done or run out of simplifier
1121 ticks/RAM. This pattern happened #18730.
1122
1123 To combat this we introduce one more heuristic when weighing inlining decision.
1124 We keep track of a "case-depth". Which increases each time we look inside a case
1125 expression with more than one alternative.
1126
1127 We then apply a penalty to inlinings based on the case-depth at which they would
1128 be inlined. Bounding the number of inlinings in such a scenario.
1129
1130 The heuristic can be tuned in two ways:
1131
1132 * We can ignore the first n levels of case nestings for inlining decisions using
1133 -funfolding-case-threshold.
1134 * The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling.
1135 Scaling can be set with -funfolding-case-scaling.
1136
1137 Some guidance on setting these defaults:
1138
1139 * A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of
1140 control. We picked 2 for no particular reason.
1141 * Scaling the penalty by any more than 30 means the reproducer from
1142 T18730 won't compile even with reasonably small values of n. Instead
1143 it will run out of runs/ticks. This means to positively affect the reproducer
1144 a scaling <= 30 is required.
1145 * A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks.
1146 (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps)
1147 * A scaling of >= 25 showed no regressions on nofib. However it showed a number of
1148 (small) regression for compiler perf benchmarks.
1149
1150 The end result is that we are settling for a scaling of 30, with a threshold of 2.
1151 This gives us minimal compiler perf regressions. No nofib runtime regressions and
1152 will still avoid this pattern sometimes. This is a "safe" default, where we err on
1153 the side of compiler blowup instead of risking runtime regressions.
1154
1155 For cases where the default falls short the flag can be changed to allow more/less inlining as
1156 needed on a per-module basis.
1157
1158 -}
1159
1160 tryUnfolding :: Logger -> UnfoldingOpts -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
1161 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1162 -> Maybe CoreExpr
1163 tryUnfolding logger opts !case_depth id lone_variable
1164 arg_infos cont_info unf_template
1165 is_wf is_exp guidance
1166 = case guidance of
1167 UnfNever -> traceInline logger opts id str (text "UnfNever") Nothing
1168
1169 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1170 | enough_args && (boring_ok || some_benefit || unfoldingVeryAggressive opts)
1171 -- See Note [INLINE for small functions (3)]
1172 -> traceInline logger opts id str (mk_doc some_benefit empty True) (Just unf_template)
1173 | otherwise
1174 -> traceInline logger opts id str (mk_doc some_benefit empty False) Nothing
1175 where
1176 some_benefit = calc_some_benefit uf_arity
1177 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1178
1179 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1180 | unfoldingVeryAggressive opts
1181 -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1182 | is_wf && some_benefit && small_enough
1183 -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1184 | otherwise
1185 -> traceInline logger opts id str (mk_doc some_benefit extra_doc False) Nothing
1186 where
1187 some_benefit = calc_some_benefit (length arg_discounts)
1188 extra_doc = vcat [ text "case depth =" <+> int case_depth
1189 , text "depth based penalty =" <+> int depth_penalty
1190 , text "discounted size =" <+> int adjusted_size ]
1191 -- See Note [Avoid inlining into deeply nested cases]
1192 depth_treshold = unfoldingCaseThreshold opts
1193 depth_scaling = unfoldingCaseScaling opts
1194 depth_penalty | case_depth <= depth_treshold = 0
1195 | otherwise = (size * (case_depth - depth_treshold)) `div` depth_scaling
1196 adjusted_size = size + depth_penalty - discount
1197 small_enough = adjusted_size <= unfoldingUseThreshold opts
1198 discount = computeDiscount arg_discounts res_discount arg_infos cont_info
1199
1200 where
1201 mk_doc some_benefit extra_doc yes_or_no
1202 = vcat [ text "arg infos" <+> ppr arg_infos
1203 , text "interesting continuation" <+> ppr cont_info
1204 , text "some_benefit" <+> ppr some_benefit
1205 , text "is exp:" <+> ppr is_exp
1206 , text "is work-free:" <+> ppr is_wf
1207 , text "guidance" <+> ppr guidance
1208 , extra_doc
1209 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1210
1211 ctx = log_default_dump_context (logFlags logger)
1212 str = "Considering inlining: " ++ renderWithContext ctx (ppr id)
1213 n_val_args = length arg_infos
1214
1215 -- some_benefit is used when the RHS is small enough
1216 -- and the call has enough (or too many) value
1217 -- arguments (ie n_val_args >= arity). But there must
1218 -- be *something* interesting about some argument, or the
1219 -- result context, to make it worth inlining
1220 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1221 -- expected by the unfolding
1222 calc_some_benefit uf_arity
1223 | not saturated = interesting_args -- Under-saturated
1224 -- Note [Unsaturated applications]
1225 | otherwise = interesting_args -- Saturated or over-saturated
1226 || interesting_call
1227 where
1228 saturated = n_val_args >= uf_arity
1229 over_saturated = n_val_args > uf_arity
1230 interesting_args = any nonTriv arg_infos
1231 -- NB: (any nonTriv arg_infos) looks at the
1232 -- over-saturated args too which is "wrong";
1233 -- but if over-saturated we inline anyway.
1234
1235 interesting_call
1236 | over_saturated
1237 = True
1238 | otherwise
1239 = case cont_info of
1240 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1241 ValAppCtxt -> True -- Note [Cast then apply]
1242 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1243 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt]
1244 RhsCtxt -> uf_arity > 0 --
1245 _other -> False -- See Note [Nested functions]
1246
1247
1248 {-
1249 Note [Unfold into lazy contexts], Note [RHS of lets]
1250 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1251 When the call is the argument of a function with a RULE, or the RHS of a let,
1252 we are a little bit keener to inline. For example
1253 f y = (y,y,y)
1254 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1255 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1256 only we can't see it. Also
1257 x = f v
1258 could be expensive whereas
1259 x = case v of (a,b) -> a
1260 is patently cheap and may allow more eta expansion.
1261 So we treat the RHS of a let as not-totally-boring.
1262
1263 Note [Unsaturated applications]
1264 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1265 When a call is not saturated, we *still* inline if one of the
1266 arguments has interesting structure. That's sometimes very important.
1267 A good example is the Ord instance for Bool in Base:
1268
1269 Rec {
1270 $fOrdBool =GHC.Classes.D:Ord
1271 @ Bool
1272 ...
1273 $cmin_ajX
1274
1275 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1276 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1277 }
1278
1279 But the defn of GHC.Classes.$dmmin is:
1280
1281 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1282 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1283 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1284 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1285 GHC.Types.False -> y GHC.Types.True -> x }) -}
1286
1287 We *really* want to inline $dmmin, even though it has arity 3, in
1288 order to unravel the recursion.
1289
1290
1291 Note [Things to watch]
1292 ~~~~~~~~~~~~~~~~~~~~~~
1293 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1294 Assume x is exported, so not inlined unconditionally.
1295 Then we want x to inline unconditionally; no reason for it
1296 not to, and doing so avoids an indirection.
1297
1298 * { x = I# 3; ....f x.... }
1299 Make sure that x does not inline unconditionally!
1300 Lest we get extra allocation.
1301
1302 Note [Inlining an InlineRule]
1303 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1304 An InlineRules is used for
1305 (a) programmer INLINE pragmas
1306 (b) inlinings from worker/wrapper
1307
1308 For (a) the RHS may be large, and our contract is that we *only* inline
1309 when the function is applied to all the arguments on the LHS of the
1310 source-code defn. (The uf_arity in the rule.)
1311
1312 However for worker/wrapper it may be worth inlining even if the
1313 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1314 require saturation.
1315
1316 Note [Nested functions]
1317 ~~~~~~~~~~~~~~~~~~~~~~~
1318 At one time we treated a call of a non-top-level function as
1319 "interesting" (regardless of how boring the context) in the hope
1320 that inlining it would eliminate the binding, and its allocation.
1321 Specifically, in the default case of interesting_call we had
1322 _other -> not is_top && uf_arity > 0
1323
1324 But actually postInlineUnconditionally does some of this and overall
1325 it makes virtually no difference to nofib. So I simplified away this
1326 special case
1327
1328 Note [Cast then apply]
1329 ~~~~~~~~~~~~~~~~~~~~~~
1330 Consider
1331 myIndex = __inline_me ( (/\a. <blah>) |> co )
1332 co :: (forall a. a -> a) ~ (forall a. T a)
1333 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1334
1335 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1336 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1337
1338 Note [Inlining in ArgCtxt]
1339 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1340 The condition (arity > 0) here is very important, because otherwise
1341 we end up inlining top-level stuff into useless places; eg
1342 x = I# 3#
1343 f = \y. g x
1344 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1345 and 20% to 'power'.
1346
1347 At one stage I replaced this condition by 'True' (leading to the above
1348 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1349 to work ok now.
1350
1351 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1352 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1353 for the RHS of a 'let', we only profit from the inlining if we get a
1354 CONLIKE thing (modulo lets).
1355
1356 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1357 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1358 The "lone-variable" case is important. I spent ages messing about
1359 with unsatisfactory variants, but this is nice. The idea is that if a
1360 variable appears all alone
1361
1362 as an arg of lazy fn, or rhs BoringCtxt
1363 as scrutinee of a case CaseCtxt
1364 as arg of a fn ArgCtxt
1365 AND
1366 it is bound to a cheap expression
1367
1368 then we should not inline it (unless there is some other reason,
1369 e.g. it is the sole occurrence). That is what is happening at
1370 the use of 'lone_variable' in 'interesting_call'.
1371
1372 Why? At least in the case-scrutinee situation, turning
1373 let x = (a,b) in case x of y -> ...
1374 into
1375 let x = (a,b) in case (a,b) of y -> ...
1376 and thence to
1377 let x = (a,b) in let y = (a,b) in ...
1378 is bad if the binding for x will remain.
1379
1380 Another example: I discovered that strings
1381 were getting inlined straight back into applications of 'error'
1382 because the latter is strict.
1383 s = "foo"
1384 f = \x -> ...(error s)...
1385
1386 Fundamentally such contexts should not encourage inlining because, provided
1387 the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the
1388 context can ``see'' the unfolding of the variable (e.g. case or a
1389 RULE) so there's no gain.
1390
1391 However, watch out:
1392
1393 * Consider this:
1394 foo = _inline_ (\n. [n])
1395 bar = _inline_ (foo 20)
1396 baz = \n. case bar of { (m:_) -> m + n }
1397 Here we really want to inline 'bar' so that we can inline 'foo'
1398 and the whole thing unravels as it should obviously do. This is
1399 important: in the NDP project, 'bar' generates a closure data
1400 structure rather than a list.
1401
1402 So the non-inlining of lone_variables should only apply if the
1403 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1404 looks through the unfolding. Hence the "&& is_wf" in the
1405 InlineRule branch.
1406
1407 * Even a type application or coercion isn't a lone variable.
1408 Consider
1409 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1410 We had better inline that sucker! The case won't see through it.
1411
1412 For now, I'm treating treating a variable applied to types
1413 in a *lazy* context "lone". The motivating example was
1414 f = /\a. \x. BIG
1415 g = /\a. \y. h (f a)
1416 There's no advantage in inlining f here, and perhaps
1417 a significant disadvantage. Hence some_val_args in the Stop case
1418
1419 Note [Interaction of exprIsWorkFree and lone variables]
1420 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1421 The lone-variable test says "don't inline if a case expression
1422 scrutinises a lone variable whose unfolding is cheap". It's very
1423 important that, under these circumstances, exprIsConApp_maybe
1424 can spot a constructor application. So, for example, we don't
1425 consider
1426 let x = e in (x,x)
1427 to be cheap, and that's good because exprIsConApp_maybe doesn't
1428 think that expression is a constructor application.
1429
1430 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1431 rather than is_wf, which was utterly wrong, because the above
1432 expression responds True to exprIsHNF, which is what sets is_value.
1433
1434 This kind of thing can occur if you have
1435
1436 {-# INLINE foo #-}
1437 foo = let x = e in (x,x)
1438
1439 which Roman did.
1440
1441
1442 -}
1443
1444 computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt
1445 -> Int
1446 computeDiscount arg_discounts res_discount arg_infos cont_info
1447
1448 = 10 -- Discount of 10 because the result replaces the call
1449 -- so we count 10 for the function itself
1450
1451 + 10 * length actual_arg_discounts
1452 -- Discount of 10 for each arg supplied,
1453 -- because the result replaces the call
1454
1455 + total_arg_discount + res_discount'
1456 where
1457 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1458 total_arg_discount = sum actual_arg_discounts
1459
1460 mk_arg_discount _ TrivArg = 0
1461 mk_arg_discount _ NonTrivArg = 10
1462 mk_arg_discount discount ValueArg = discount
1463
1464 res_discount'
1465 | LT <- arg_discounts `compareLength` arg_infos
1466 = res_discount -- Over-saturated
1467 | otherwise
1468 = case cont_info of
1469 BoringCtxt -> 0
1470 CaseCtxt -> res_discount -- Presumably a constructor
1471 ValAppCtxt -> res_discount -- Presumably a function
1472 _ -> 40 `min` res_discount
1473 -- ToDo: this 40 `min` res_discount doesn't seem right
1474 -- for DiscArgCtxt it shouldn't matter because the function will
1475 -- get the arg discount for any non-triv arg
1476 -- for RuleArgCtxt we do want to be keener to inline; but not only
1477 -- constructor results
1478 -- for RhsCtxt I suppose that exposing a data con is good in general
1479 -- And 40 seems very arbitrary
1480 --
1481 -- res_discount can be very large when a function returns
1482 -- constructors; but we only want to invoke that large discount
1483 -- when there's a case continuation.
1484 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1485 -- But we want to avoid inlining large functions that return
1486 -- constructors into contexts that are simply "interesting"