never executed always true always false

    1 {-
    2 (c) The University of Glasgow 2006
    3 (c) The AQUA Project, Glasgow University, 1994-1998
    4 
    5 
    6 Core-syntax unfoldings
    7 
    8 Unfoldings (which can travel across module boundaries) are in Core
    9 syntax (namely @CoreExpr@s).
   10 
   11 The type @Unfolding@ sits ``above'' simply-Core-expressions
   12 unfoldings, capturing ``higher-level'' things we know about a binding,
   13 usually things that the simplifier found out (e.g., ``it's a
   14 literal'').  In the corner of a @CoreUnfolding@ unfolding, you will
   15 find, unsurprisingly, a Core expression.
   16 -}
   17 
   18 
   19 {-# LANGUAGE BangPatterns #-}
   20 
   21 module GHC.Core.Unfold (
   22         Unfolding, UnfoldingGuidance,   -- Abstract types
   23 
   24         UnfoldingOpts (..), defaultUnfoldingOpts,
   25         updateCreationThreshold, updateUseThreshold,
   26         updateFunAppDiscount, updateDictDiscount,
   27         updateVeryAggressive, updateCaseScaling,
   28         updateCaseThreshold, updateReportPrefix,
   29 
   30         ArgSummary(..),
   31 
   32         couldBeSmallEnoughToInline, inlineBoringOk,
   33         smallEnoughToInline,
   34 
   35         callSiteInline, CallCtxt(..),
   36         calcUnfoldingGuidance
   37     ) where
   38 
   39 import GHC.Prelude
   40 
   41 import GHC.Driver.Flags
   42 
   43 import GHC.Core
   44 import GHC.Core.Utils
   45 import GHC.Types.Id
   46 import GHC.Core.DataCon
   47 import GHC.Types.Literal
   48 import GHC.Builtin.PrimOps
   49 import GHC.Types.Id.Info
   50 import GHC.Types.Basic  ( Arity )
   51 import GHC.Core.Type
   52 import GHC.Builtin.Names
   53 import GHC.Builtin.Types.Prim ( realWorldStatePrimTy )
   54 import GHC.Data.Bag
   55 import GHC.Utils.Logger
   56 import GHC.Utils.Misc
   57 import GHC.Utils.Outputable
   58 import GHC.Types.ForeignCall
   59 import GHC.Types.Name
   60 import GHC.Types.Tickish
   61 
   62 import qualified Data.ByteString as BS
   63 import Data.List (isPrefixOf)
   64 
   65 
   66 -- | Unfolding options
   67 data UnfoldingOpts = UnfoldingOpts
   68    { unfoldingCreationThreshold :: !Int
   69       -- ^ Threshold above which unfoldings are not *created*
   70 
   71    , unfoldingUseThreshold :: !Int
   72       -- ^ Threshold above which unfoldings are not *inlined*
   73 
   74    , unfoldingFunAppDiscount :: !Int
   75       -- ^ Discount for lambdas that are used (applied)
   76 
   77    , unfoldingDictDiscount :: !Int
   78       -- ^ Discount for dictionaries
   79 
   80    , unfoldingVeryAggressive :: !Bool
   81       -- ^ Force inlining in many more cases
   82 
   83    , unfoldingCaseThreshold :: !Int
   84       -- ^ Don't consider depth up to x
   85 
   86    , unfoldingCaseScaling :: !Int
   87       -- ^ Penalize depth with 1/x
   88 
   89    , unfoldingReportPrefix :: !(Maybe String)
   90       -- ^ Only report inlining decisions for names with this prefix
   91    }
   92 
   93 defaultUnfoldingOpts :: UnfoldingOpts
   94 defaultUnfoldingOpts = UnfoldingOpts
   95    { unfoldingCreationThreshold = 750
   96       -- The unfoldingCreationThreshold threshold must be reasonably high
   97       -- to take account of possible discounts.
   98       -- E.g. 450 is not enough in 'fulsom' for Interval.sqr to
   99       -- inline into Csg.calc (The unfolding for sqr never makes it
  100       -- into the interface file.)
  101 
  102    , unfoldingUseThreshold   = 90
  103       -- Last adjusted upwards in #18282, when I reduced
  104       -- the result discount for constructors.
  105 
  106    , unfoldingFunAppDiscount = 60
  107       -- Be fairly keen to inline a function if that means
  108       -- we'll be able to pick the right method from a dictionary
  109 
  110    , unfoldingDictDiscount   = 30
  111       -- Be fairly keen to inline a function if that means
  112       -- we'll be able to pick the right method from a dictionary
  113 
  114    , unfoldingVeryAggressive = False
  115 
  116       -- Only apply scaling once we are deeper than threshold cases
  117       -- in an RHS.
  118    , unfoldingCaseThreshold = 2
  119 
  120       -- Penalize depth with (size*depth)/scaling
  121    , unfoldingCaseScaling = 30
  122 
  123       -- Don't filter inlining decision reports
  124    , unfoldingReportPrefix = Nothing
  125    }
  126 
  127 -- Helpers for "GHC.Driver.Session"
  128 
  129 updateCreationThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
  130 updateCreationThreshold n opts = opts { unfoldingCreationThreshold = n }
  131 
  132 updateUseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
  133 updateUseThreshold n opts = opts { unfoldingUseThreshold = n }
  134 
  135 updateFunAppDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts
  136 updateFunAppDiscount n opts = opts { unfoldingFunAppDiscount = n }
  137 
  138 updateDictDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts
  139 updateDictDiscount n opts = opts { unfoldingDictDiscount = n }
  140 
  141 updateVeryAggressive :: Bool -> UnfoldingOpts -> UnfoldingOpts
  142 updateVeryAggressive n opts = opts { unfoldingVeryAggressive = n }
  143 
  144 
  145 updateCaseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
  146 updateCaseThreshold n opts = opts { unfoldingCaseThreshold = n }
  147 
  148 updateCaseScaling :: Int -> UnfoldingOpts -> UnfoldingOpts
  149 updateCaseScaling n opts = opts { unfoldingCaseScaling = n }
  150 
  151 updateReportPrefix :: Maybe String -> UnfoldingOpts -> UnfoldingOpts
  152 updateReportPrefix n opts = opts { unfoldingReportPrefix = n }
  153 
  154 {-
  155 Note [Occurrence analysis of unfoldings]
  156 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  157 We do occurrence-analysis of unfoldings once and for all, when the
  158 unfolding is built, rather than each time we inline them.
  159 
  160 But given this decision it's vital that we do
  161 *always* do it.  Consider this unfolding
  162     \x -> letrec { f = ...g...; g* = f } in body
  163 where g* is (for some strange reason) the loop breaker.  If we don't
  164 occ-anal it when reading it in, we won't mark g as a loop breaker, and
  165 we may inline g entirely in body, dropping its binding, and leaving
  166 the occurrence in f out of scope. This happened in #8892, where
  167 the unfolding in question was a DFun unfolding.
  168 
  169 But more generally, the simplifier is designed on the
  170 basis that it is looking at occurrence-analysed expressions, so better
  171 ensure that they actually are.
  172 
  173 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
  174 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  175 Notice that we give the non-occur-analysed expression to
  176 calcUnfoldingGuidance.  In some ways it'd be better to occur-analyse
  177 first; for example, sometimes during simplification, there's a large
  178 let-bound thing which has been substituted, and so is now dead; so
  179 'expr' contains two copies of the thing while the occurrence-analysed
  180 expression doesn't.
  181 
  182 Nevertheless, we *don't* and *must not* occ-analyse before computing
  183 the size because
  184 
  185 a) The size computation bales out after a while, whereas occurrence
  186    analysis does not.
  187 
  188 b) Residency increases sharply if you occ-anal first.  I'm not
  189    100% sure why, but it's a large effect.  Compiling Cabal went
  190    from residency of 534M to over 800M with this one change.
  191 
  192 This can occasionally mean that the guidance is very pessimistic;
  193 it gets fixed up next round.  And it should be rare, because large
  194 let-bound things that are dead are usually caught by preInlineUnconditionally
  195 
  196 
  197 ************************************************************************
  198 *                                                                      *
  199 \subsection{The UnfoldingGuidance type}
  200 *                                                                      *
  201 ************************************************************************
  202 -}
  203 
  204 inlineBoringOk :: CoreExpr -> Bool
  205 -- See Note [INLINE for small functions]
  206 -- True => the result of inlining the expression is
  207 --         no bigger than the expression itself
  208 --     eg      (\x y -> f y x)
  209 -- This is a quick and dirty version. It doesn't attempt
  210 -- to deal with  (\x y z -> x (y z))
  211 -- The really important one is (x `cast` c)
  212 inlineBoringOk e
  213   = go 0 e
  214   where
  215     go :: Int -> CoreExpr -> Bool
  216     go credit (Lam x e) | isId x           = go (credit+1) e
  217                         | otherwise        = go credit e
  218         -- See Note [Count coercion arguments in boring contexts]
  219     go credit (App f (Type {}))            = go credit f
  220     go credit (App f a) | credit > 0
  221                         , exprIsTrivial a  = go (credit-1) f
  222     go credit (Tick _ e)                   = go credit e -- dubious
  223     go credit (Cast e _)                   = go credit e
  224     go credit (Case scrut _ _ [Alt _ _ rhs]) -- See Note [Inline unsafeCoerce]
  225       | isUnsafeEqualityProof scrut        = go credit rhs
  226     go _      (Var {})                     = boringCxtOk
  227     go _      _                            = boringCxtNotOk
  228 
  229 calcUnfoldingGuidance
  230         :: UnfoldingOpts
  231         -> Bool          -- Definitely a top-level, bottoming binding
  232         -> CoreExpr      -- Expression to look at
  233         -> UnfoldingGuidance
  234 calcUnfoldingGuidance opts is_top_bottoming (Tick t expr)
  235   | not (tickishIsCode t)  -- non-code ticks don't matter for unfolding
  236   = calcUnfoldingGuidance opts is_top_bottoming expr
  237 calcUnfoldingGuidance opts is_top_bottoming expr
  238   = case sizeExpr opts bOMB_OUT_SIZE val_bndrs body of
  239       TooBig -> UnfNever
  240       SizeIs size cased_bndrs scrut_discount
  241         | uncondInline expr n_val_bndrs size
  242         -> UnfWhen { ug_unsat_ok = unSaturatedOk
  243                    , ug_boring_ok =  boringCxtOk
  244                    , ug_arity = n_val_bndrs }   -- Note [INLINE for small functions]
  245 
  246         | is_top_bottoming
  247         -> UnfNever   -- See Note [Do not inline top-level bottoming functions]
  248 
  249         | otherwise
  250         -> UnfIfGoodArgs { ug_args  = map (mk_discount cased_bndrs) val_bndrs
  251                          , ug_size  = size
  252                          , ug_res   = scrut_discount }
  253 
  254   where
  255     (bndrs, body) = collectBinders expr
  256     bOMB_OUT_SIZE = unfoldingCreationThreshold opts
  257            -- Bomb out if size gets bigger than this
  258     val_bndrs   = filter isId bndrs
  259     n_val_bndrs = length val_bndrs
  260 
  261     mk_discount :: Bag (Id,Int) -> Id -> Int
  262     mk_discount cbs bndr = foldl' combine 0 cbs
  263            where
  264              combine acc (bndr', disc)
  265                | bndr == bndr' = acc `plus_disc` disc
  266                | otherwise     = acc
  267 
  268              plus_disc :: Int -> Int -> Int
  269              plus_disc | isFunTy (idType bndr) = max
  270                        | otherwise             = (+)
  271              -- See Note [Function and non-function discounts]
  272 
  273 {- Note [Inline unsafeCoerce]
  274 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  275 We really want to inline unsafeCoerce, even when applied to boring
  276 arguments.  It doesn't look as if its RHS is smaller than the call
  277    unsafeCoerce x = case unsafeEqualityProof @a @b of UnsafeRefl -> x
  278 but that case is discarded -- see Note [Implementing unsafeCoerce]
  279 in base:Unsafe.Coerce.
  280 
  281 Moreover, if we /don't/ inline it, we may be left with
  282           f (unsafeCoerce x)
  283 which will build a thunk -- bad, bad, bad.
  284 
  285 Conclusion: we really want inlineBoringOk to be True of the RHS of
  286 unsafeCoerce.  This is (U4) in Note [Implementing unsafeCoerce].
  287 
  288 Note [Computing the size of an expression]
  289 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  290 The basic idea of sizeExpr is obvious enough: count nodes.  But getting the
  291 heuristics right has taken a long time.  Here's the basic strategy:
  292 
  293     * Variables, literals: 0
  294       (Exception for string literals, see litSize.)
  295 
  296     * Function applications (f e1 .. en): 1 + #value args
  297 
  298     * Constructor applications: 1, regardless of #args
  299 
  300     * Let(rec): 1 + size of components
  301 
  302     * Note, cast: 0
  303 
  304 Examples
  305 
  306   Size  Term
  307   --------------
  308     0     42#
  309     0     x
  310     0     True
  311     2     f x
  312     1     Just x
  313     4     f (g x)
  314 
  315 Notice that 'x' counts 0, while (f x) counts 2.  That's deliberate: there's
  316 a function call to account for.  Notice also that constructor applications
  317 are very cheap, because exposing them to a caller is so valuable.
  318 
  319 [25/5/11] All sizes are now multiplied by 10, except for primops
  320 (which have sizes like 1 or 4.  This makes primops look fantastically
  321 cheap, and seems to be almost universally beneficial.  Done partly as a
  322 result of #4978.
  323 
  324 Note [Do not inline top-level bottoming functions]
  325 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  326 The FloatOut pass has gone to some trouble to float out calls to 'error'
  327 and similar friends.  See Note [Bottoming floats] in GHC.Core.Opt.SetLevels.
  328 Do not re-inline them!  But we *do* still inline if they are very small
  329 (the uncondInline stuff).
  330 
  331 Note [INLINE for small functions]
  332 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  333 Consider        {-# INLINE f #-}
  334                 f x = Just x
  335                 g y = f y
  336 Then f's RHS is no larger than its LHS, so we should inline it into
  337 even the most boring context.  In general, f the function is
  338 sufficiently small that its body is as small as the call itself, the
  339 inline unconditionally, regardless of how boring the context is.
  340 
  341 Things to note:
  342 
  343 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
  344     than the thing it's replacing.  Notice that
  345       (f x) --> (g 3)             -- YES, unconditionally
  346       (f x) --> x : []            -- YES, *even though* there are two
  347                                   --      arguments to the cons
  348       x     --> g 3               -- NO
  349       x     --> Just v            -- NO
  350 
  351     It's very important not to unconditionally replace a variable by
  352     a non-atomic term.
  353 
  354 (2) We do this even if the thing isn't saturated, else we end up with the
  355     silly situation that
  356        f x y = x
  357        ...map (f 3)...
  358     doesn't inline.  Even in a boring context, inlining without being
  359     saturated will give a lambda instead of a PAP, and will be more
  360     efficient at runtime.
  361 
  362 (3) However, when the function's arity > 0, we do insist that it
  363     has at least one value argument at the call site.  (This check is
  364     made in the UnfWhen case of callSiteInline.) Otherwise we find this:
  365          f = /\a \x:a. x
  366          d = /\b. MkD (f b)
  367     If we inline f here we get
  368          d = /\b. MkD (\x:b. x)
  369     and then prepareRhs floats out the argument, abstracting the type
  370     variables, so we end up with the original again!
  371 
  372 (4) We must be much more cautious about arity-zero things. Consider
  373        let x = y +# z in ...
  374     In *size* terms primops look very small, because the generate a
  375     single instruction, but we do not want to unconditionally replace
  376     every occurrence of x with (y +# z).  So we only do the
  377     unconditional-inline thing for *trivial* expressions.
  378 
  379     NB: you might think that PostInlineUnconditionally would do this
  380     but it doesn't fire for top-level things; see GHC.Core.Opt.Simplify.Utils
  381     Note [Top level and postInlineUnconditionally]
  382 
  383 Note [Count coercion arguments in boring contexts]
  384 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  385 In inlineBoringOK, we ignore type arguments when deciding whether an
  386 expression is okay to inline into boring contexts. This is good, since
  387 if we have a definition like
  388 
  389   let y = x @Int in f y y
  390 
  391 there’s no reason not to inline y at both use sites — no work is
  392 actually duplicated. It may seem like the same reasoning applies to
  393 coercion arguments, and indeed, in #17182 we changed inlineBoringOK to
  394 treat coercions the same way.
  395 
  396 However, this isn’t a good idea: unlike type arguments, which have
  397 no runtime representation, coercion arguments *do* have a runtime
  398 representation (albeit the zero-width VoidRep, see Note [Coercion tokens]
  399 in "GHC.CoreToStg"). This caused trouble in #17787 for DataCon wrappers for
  400 nullary GADT constructors: the wrappers would be inlined and each use of
  401 the constructor would lead to a separate allocation instead of just
  402 sharing the wrapper closure.
  403 
  404 The solution: don’t ignore coercion arguments after all.
  405 -}
  406 
  407 uncondInline :: CoreExpr -> Arity -> Int -> Bool
  408 -- Inline unconditionally if there no size increase
  409 -- Size of call is arity (+1 for the function)
  410 -- See Note [INLINE for small functions]
  411 uncondInline rhs arity size
  412   | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
  413   | otherwise = exprIsTrivial rhs        -- See Note [INLINE for small functions] (4)
  414 
  415 sizeExpr :: UnfoldingOpts
  416          -> Int             -- Bomb out if it gets bigger than this
  417          -> [Id]            -- Arguments; we're interested in which of these
  418                             -- get case'd
  419          -> CoreExpr
  420          -> ExprSize
  421 
  422 -- Note [Computing the size of an expression]
  423 
  424 -- Forcing bOMB_OUT_SIZE early prevents repeated
  425 -- unboxing of the Int argument.
  426 sizeExpr opts !bOMB_OUT_SIZE top_args expr
  427   = size_up expr
  428   where
  429     size_up (Cast e _) = size_up e
  430     size_up (Tick _ e) = size_up e
  431     size_up (Type _)   = sizeZero           -- Types cost nothing
  432     size_up (Coercion _) = sizeZero
  433     size_up (Lit lit)  = sizeN (litSize lit)
  434     size_up (Var f) | isRealWorldId f = sizeZero
  435                       -- Make sure we get constructor discounts even
  436                       -- on nullary constructors
  437                     | otherwise       = size_up_call f [] 0
  438 
  439     size_up (App fun arg)
  440       | isTyCoArg arg = size_up fun
  441       | otherwise     = size_up arg  `addSizeNSD`
  442                         size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
  443 
  444     size_up (Lam b e)
  445       | isId b && not (isRealWorldId b) = lamScrutDiscount opts (size_up e `addSizeN` 10)
  446       | otherwise = size_up e
  447 
  448     size_up (Let (NonRec binder rhs) body)
  449       = size_up_rhs (binder, rhs) `addSizeNSD`
  450         size_up body              `addSizeN`
  451         size_up_alloc binder
  452 
  453     size_up (Let (Rec pairs) body)
  454       = foldr (addSizeNSD . size_up_rhs)
  455               (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
  456               pairs
  457 
  458     size_up (Case e _ _ alts)
  459         | null alts
  460         = size_up e    -- case e of {} never returns, so take size of scrutinee
  461 
  462     size_up (Case e _ _ alts)
  463         -- Now alts is non-empty
  464         | Just v <- is_top_arg e -- We are scrutinising an argument variable
  465         = let
  466             alt_sizes = map size_up_alt alts
  467 
  468                   -- alts_size tries to compute a good discount for
  469                   -- the case when we are scrutinising an argument variable
  470             alts_size (SizeIs tot tot_disc tot_scrut)
  471                           -- Size of all alternatives
  472                       (SizeIs max _        _)
  473                           -- Size of biggest alternative
  474                   = SizeIs tot (unitBag (v, 20 + tot - max)
  475                       `unionBags` tot_disc) tot_scrut
  476                           -- If the variable is known, we produce a
  477                           -- discount that will take us back to 'max',
  478                           -- the size of the largest alternative The
  479                           -- 1+ is a little discount for reduced
  480                           -- allocation in the caller
  481                           --
  482                           -- Notice though, that we return tot_disc,
  483                           -- the total discount from all branches.  I
  484                           -- think that's right.
  485 
  486             alts_size tot_size _ = tot_size
  487           in
  488           alts_size (foldr1 addAltSize alt_sizes)  -- alts is non-empty
  489                     (foldr1 maxSize    alt_sizes)
  490                 -- Good to inline if an arg is scrutinised, because
  491                 -- that may eliminate allocation in the caller
  492                 -- And it eliminates the case itself
  493         where
  494           is_top_arg (Var v) | v `elem` top_args = Just v
  495           is_top_arg (Cast e _) = is_top_arg e
  496           is_top_arg _ = Nothing
  497 
  498 
  499     size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
  500                                 foldr (addAltSize . size_up_alt) case_size alts
  501       where
  502           case_size
  503            | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
  504            | otherwise = sizeZero
  505                 -- Normally we don't charge for the case itself, but
  506                 -- we charge one per alternative (see size_up_alt,
  507                 -- below) to account for the cost of the info table
  508                 -- and comparisons.
  509                 --
  510                 -- However, in certain cases (see is_inline_scrut
  511                 -- below), no code is generated for the case unless
  512                 -- there are multiple alts.  In these cases we
  513                 -- subtract one, making the first alt free.
  514                 -- e.g. case x# +# y# of _ -> ...   should cost 1
  515                 --      case touch# x# of _ -> ...  should cost 0
  516                 -- (see #4978)
  517                 --
  518                 -- I would like to not have the "lengthAtMost alts 1"
  519                 -- condition above, but without that some programs got worse
  520                 -- (spectral/hartel/event and spectral/para).  I don't fully
  521                 -- understand why. (SDM 24/5/11)
  522 
  523                 -- unboxed variables, inline primops and unsafe foreign calls
  524                 -- are all "inline" things:
  525           is_inline_scrut (Var v) = isUnliftedType (idType v)
  526           is_inline_scrut scrut
  527               | (Var f, _) <- collectArgs scrut
  528                 = case idDetails f of
  529                     FCallId fc  -> not (isSafeForeignCall fc)
  530                     PrimOpId op -> not (primOpOutOfLine op)
  531                     _other      -> False
  532               | otherwise
  533                 = False
  534 
  535     size_up_rhs (bndr, rhs)
  536       | Just join_arity <- isJoinId_maybe bndr
  537         -- Skip arguments to join point
  538       , (_bndrs, body) <- collectNBinders join_arity rhs
  539       = size_up body
  540       | otherwise
  541       = size_up rhs
  542 
  543     ------------
  544     -- size_up_app is used when there's ONE OR MORE value args
  545     size_up_app (App fun arg) args voids
  546         | isTyCoArg arg                  = size_up_app fun args voids
  547         | isRealWorldExpr arg            = size_up_app fun (arg:args) (voids + 1)
  548         | otherwise                      = size_up arg  `addSizeNSD`
  549                                            size_up_app fun (arg:args) voids
  550     size_up_app (Var fun)     args voids = size_up_call fun args voids
  551     size_up_app (Tick _ expr) args voids = size_up_app expr args voids
  552     size_up_app (Cast expr _) args voids = size_up_app expr args voids
  553     size_up_app other         args voids = size_up other `addSizeN`
  554                                            callSize (length args) voids
  555        -- if the lhs is not an App or a Var, or an invisible thing like a
  556        -- Tick or Cast, then we should charge for a complete call plus the
  557        -- size of the lhs itself.
  558 
  559     ------------
  560     size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
  561     size_up_call fun val_args voids
  562        = case idDetails fun of
  563            FCallId _        -> sizeN (callSize (length val_args) voids)
  564            DataConWorkId dc -> conSize    dc (length val_args)
  565            PrimOpId op      -> primOpSize op (length val_args)
  566            ClassOpId _      -> classOpSize opts top_args val_args
  567            _                -> funSize opts top_args fun (length val_args) voids
  568 
  569     ------------
  570     size_up_alt (Alt _con _bndrs rhs) = size_up rhs `addSizeN` 10
  571         -- Don't charge for args, so that wrappers look cheap
  572         -- (See comments about wrappers with Case)
  573         --
  574         -- IMPORTANT: *do* charge 1 for the alternative, else we
  575         -- find that giant case nests are treated as practically free
  576         -- A good example is Foreign.C.Error.errnoToIOError
  577 
  578     ------------
  579     -- Cost to allocate binding with given binder
  580     size_up_alloc bndr
  581       |  isTyVar bndr                 -- Doesn't exist at runtime
  582       || isJoinId bndr                -- Not allocated at all
  583       || isUnliftedType (idType bndr) -- Doesn't live in heap
  584       = 0
  585       | otherwise
  586       = 10
  587 
  588     ------------
  589         -- These addSize things have to be here because
  590         -- I don't want to give them bOMB_OUT_SIZE as an argument
  591     addSizeN TooBig          _  = TooBig
  592     addSizeN (SizeIs n xs d) m  = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
  593 
  594         -- addAltSize is used to add the sizes of case alternatives
  595     addAltSize TooBig            _      = TooBig
  596     addAltSize _                 TooBig = TooBig
  597     addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
  598         = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
  599                                  (xs `unionBags` ys)
  600                                  (d1 + d2) -- Note [addAltSize result discounts]
  601 
  602         -- This variant ignores the result discount from its LEFT argument
  603         -- It's used when the second argument isn't part of the result
  604     addSizeNSD TooBig            _      = TooBig
  605     addSizeNSD _                 TooBig = TooBig
  606     addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
  607         = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
  608                                  (xs `unionBags` ys)
  609                                  d2  -- Ignore d1
  610 
  611     isRealWorldId id = idType id `eqType` realWorldStatePrimTy
  612 
  613     -- an expression of type State# RealWorld must be a variable
  614     isRealWorldExpr (Var id)   = isRealWorldId id
  615     isRealWorldExpr (Tick _ e) = isRealWorldExpr e
  616     isRealWorldExpr _          = False
  617 
  618 -- | Finds a nominal size of a string literal.
  619 litSize :: Literal -> Int
  620 -- Used by GHC.Core.Unfold.sizeExpr
  621 litSize (LitNumber LitNumBigNat _)  = 100
  622 litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
  623         -- If size could be 0 then @f "x"@ might be too small
  624         -- [Sept03: make literal strings a bit bigger to avoid fruitless
  625         --  duplication of little strings]
  626 litSize _other = 0    -- Must match size of nullary constructors
  627                       -- Key point: if  x |-> 4, then x must inline unconditionally
  628                       --            (eg via case binding)
  629 
  630 classOpSize :: UnfoldingOpts -> [Id] -> [CoreExpr] -> ExprSize
  631 -- See Note [Conlike is interesting]
  632 classOpSize _ _ []
  633   = sizeZero
  634 classOpSize opts top_args (arg1 : other_args)
  635   = SizeIs size arg_discount 0
  636   where
  637     size = 20 + (10 * length other_args)
  638     -- If the class op is scrutinising a lambda bound dictionary then
  639     -- give it a discount, to encourage the inlining of this function
  640     -- The actual discount is rather arbitrarily chosen
  641     arg_discount = case arg1 of
  642                      Var dict | dict `elem` top_args
  643                               -> unitBag (dict, unfoldingDictDiscount opts)
  644                      _other   -> emptyBag
  645 
  646 -- | The size of a function call
  647 callSize
  648  :: Int  -- ^ number of value args
  649  -> Int  -- ^ number of value args that are void
  650  -> Int
  651 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
  652         -- The 1+ is for the function itself
  653         -- Add 1 for each non-trivial arg;
  654         -- the allocation cost, as in let(rec)
  655 
  656 -- | The size of a jump to a join point
  657 jumpSize
  658  :: Int  -- ^ number of value args
  659  -> Int  -- ^ number of value args that are void
  660  -> Int
  661 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
  662   -- A jump is 20% the size of a function call. Making jumps free reopens
  663   -- bug #6048, but making them any more expensive loses a 21% improvement in
  664   -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
  665   -- better solution?
  666 
  667 funSize :: UnfoldingOpts -> [Id] -> Id -> Int -> Int -> ExprSize
  668 -- Size for functions that are not constructors or primops
  669 -- Note [Function applications]
  670 funSize opts top_args fun n_val_args voids
  671   | fun `hasKey` buildIdKey   = buildSize
  672   | fun `hasKey` augmentIdKey = augmentSize
  673   | otherwise = SizeIs size arg_discount res_discount
  674   where
  675     some_val_args = n_val_args > 0
  676     is_join = isJoinId fun
  677 
  678     size | is_join              = jumpSize n_val_args voids
  679          | not some_val_args    = 0
  680          | otherwise            = callSize n_val_args voids
  681 
  682         --                  DISCOUNTS
  683         --  See Note [Function and non-function discounts]
  684     arg_discount | some_val_args && fun `elem` top_args
  685                  = unitBag (fun, unfoldingFunAppDiscount opts)
  686                  | otherwise = emptyBag
  687         -- If the function is an argument and is applied
  688         -- to some values, give it an arg-discount
  689 
  690     res_discount | idArity fun > n_val_args = unfoldingFunAppDiscount opts
  691                  | otherwise                = 0
  692         -- If the function is partially applied, show a result discount
  693 -- XXX maybe behave like ConSize for eval'd variable
  694 
  695 conSize :: DataCon -> Int -> ExprSize
  696 conSize dc n_val_args
  697   | n_val_args == 0 = SizeIs 0 emptyBag 10    -- Like variables
  698 
  699 -- See Note [Unboxed tuple size and result discount]
  700   | isUnboxedTupleDataCon dc = SizeIs 0 emptyBag 10
  701 
  702 -- See Note [Constructor size and result discount]
  703   | otherwise = SizeIs 10 emptyBag 10
  704 
  705 {- Note [Constructor size and result discount]
  706 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  707 Treat a constructors application as size 10, regardless of how many
  708 arguments it has; we are keen to expose them (and we charge separately
  709 for their args).  We can't treat them as size zero, else we find that
  710 (Just x) has size 0, which is the same as a lone variable; and hence
  711 'v' will always be replaced by (Just x), where v is bound to Just x.
  712 
  713 The "result discount" is applied if the result of the call is
  714 scrutinised (say by a case).  For a constructor application that will
  715 mean the constructor application will disappear, so we don't need to
  716 charge it to the function.  So the discount should at least match the
  717 cost of the constructor application, namely 10.
  718 
  719 Historical note 1: Until Jun 2020 we gave it a "bit of extra
  720 incentive" via a discount of 10*(1 + n_val_args), but that was FAR too
  721 much (#18282).  In particular, consider a huge case tree like
  722 
  723    let r = case y1 of
  724           Nothing -> B1 a b c
  725           Just v1 -> case y2 of
  726                       Nothing -> B1 c b a
  727                       Just v2 -> ...
  728 
  729 If conSize gives a cost of 10 (regardless of n_val_args) and a
  730 discount of 10, that'll make each alternative RHS cost zero.  We
  731 charge 10 for each case alternative (see size_up_alt).  If we give a
  732 bigger discount (say 20) in conSize, we'll make the case expression
  733 cost *nothing*, and that can make a huge case tree cost nothing. This
  734 leads to massive, sometimes exponenial inlinings (#18282).  In short,
  735 don't give a discount that give a negative size to a sub-expression!
  736 
  737 Historical note 2: Much longer ago, Simon M tried a MUCH bigger
  738 discount: (10 * (10 + n_val_args)), and said it was an "unambiguous
  739 win", but its terribly dangerous because a function with many many
  740 case branches, each finishing with a constructor, can have an
  741 arbitrarily large discount.  This led to terrible code bloat: see #6099.
  742 
  743 Note [Unboxed tuple size and result discount]
  744 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  745 However, unboxed tuples count as size zero. I found occasions where we had
  746         f x y z = case op# x y z of { s -> (# s, () #) }
  747 and f wasn't getting inlined.
  748 
  749 I tried giving unboxed tuples a *result discount* of zero (see the
  750 commented-out line).  Why?  When returned as a result they do not
  751 allocate, so maybe we don't want to charge so much for them. If you
  752 have a non-zero discount here, we find that workers often get inlined
  753 back into wrappers, because it look like
  754     f x = case $wf x of (# a,b #) -> (a,b)
  755 and we are keener because of the case.  However while this change
  756 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
  757 more. All other changes were very small. So it's not a big deal but I
  758 didn't adopt the idea.
  759 
  760 When fixing #18282 (see Note [Constructor size and result discount])
  761 I changed the result discount to be just 10, not 10*(1+n_val_args).
  762 
  763 Note [Function and non-function discounts]
  764 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  765 We want a discount if the function is applied. A good example is
  766 monadic combinators with continuation arguments, where inlining is
  767 quite important.
  768 
  769 But we don't want a big discount when a function is called many times
  770 (see the detailed comments with #6048) because if the function is
  771 big it won't be inlined at its many call sites and no benefit results.
  772 Indeed, we can get exponentially big inlinings this way; that is what
  773 #6048 is about.
  774 
  775 On the other hand, for data-valued arguments, if there are lots of
  776 case expressions in the body, each one will get smaller if we apply
  777 the function to a constructor application, so we *want* a big discount
  778 if the argument is scrutinised by many case expressions.
  779 
  780 Conclusion:
  781   - For functions, take the max of the discounts
  782   - For data values, take the sum of the discounts
  783 
  784 
  785 Note [Literal integer size]
  786 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  787 Literal integers *can* be big (mkInteger [...coefficients...]), but
  788 need not be (IS n).  We just use an arbitrary big-ish constant here
  789 so that, in particular, we don't inline top-level defns like
  790    n = IS 5
  791 There's no point in doing so -- any optimisations will see the IS
  792 through n's unfolding.  Nor will a big size inhibit unfoldings functions
  793 that mention a literal Integer, because the float-out pass will float
  794 all those constants to top level.
  795 -}
  796 
  797 primOpSize :: PrimOp -> Int -> ExprSize
  798 primOpSize op n_val_args
  799  = if primOpOutOfLine op
  800       then sizeN (op_size + n_val_args)
  801       else sizeN op_size
  802  where
  803    op_size = primOpCodeSize op
  804 
  805 
  806 buildSize :: ExprSize
  807 buildSize = SizeIs 0 emptyBag 40
  808         -- We really want to inline applications of build
  809         -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
  810         -- Indeed, we should add a result_discount because build is
  811         -- very like a constructor.  We don't bother to check that the
  812         -- build is saturated (it usually is).  The "-2" discounts for the \c n,
  813         -- The "4" is rather arbitrary.
  814 
  815 augmentSize :: ExprSize
  816 augmentSize = SizeIs 0 emptyBag 40
  817         -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
  818         -- e plus ys. The -2 accounts for the \cn
  819 
  820 -- When we return a lambda, give a discount if it's used (applied)
  821 lamScrutDiscount :: UnfoldingOpts -> ExprSize -> ExprSize
  822 lamScrutDiscount opts (SizeIs n vs _) = SizeIs n vs (unfoldingFunAppDiscount opts)
  823 lamScrutDiscount _      TooBig          = TooBig
  824 
  825 {-
  826 Note [addAltSize result discounts]
  827 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  828 When adding the size of alternatives, we *add* the result discounts
  829 too, rather than take the *maximum*.  For a multi-branch case, this
  830 gives a discount for each branch that returns a constructor, making us
  831 keener to inline.  I did try using 'max' instead, but it makes nofib
  832 'rewrite' and 'puzzle' allocate significantly more, and didn't make
  833 binary sizes shrink significantly either.
  834 
  835 Note [Discounts and thresholds]
  836 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  837 
  838 Constants for discounts and thresholds are defined in 'UnfoldingOpts'. They are:
  839 
  840 unfoldingCreationThreshold
  841      At a definition site, if the unfolding is bigger than this, we
  842      may discard it altogether
  843 
  844 unfoldingUseThreshold
  845      At a call site, if the unfolding, less discounts, is smaller than
  846      this, then it's small enough inline
  847 
  848 unfoldingDictDiscount
  849      The discount for each occurrence of a dictionary argument
  850      as an argument of a class method.  Should be pretty small
  851      else big functions may get inlined
  852 
  853 unfoldingFunAppDiscount
  854      Discount for a function argument that is applied.  Quite
  855      large, because if we inline we avoid the higher-order call.
  856 
  857 unfoldingVeryAggressive
  858      If True, the compiler ignores all the thresholds and inlines very
  859      aggressively. It still adheres to arity, simplifier phase control and
  860      loop breakers.
  861 
  862 
  863 Historical Note: Before April 2020 we had another factor,
  864 ufKeenessFactor, which would scale the discounts before they were subtracted
  865 from the size. This was justified with the following comment:
  866 
  867   -- We multiply the raw discounts (args_discount and result_discount)
  868   -- ty opt_UnfoldingKeenessFactor because the former have to do with
  869   --  *size* whereas the discounts imply that there's some extra
  870   --  *efficiency* to be gained (e.g. beta reductions, case reductions)
  871   -- by inlining.
  872 
  873 However, this is highly suspect since it means that we subtract a *scaled* size
  874 from an absolute size, resulting in crazy (e.g. negative) scores in some cases
  875 (#15304). We consequently killed off ufKeenessFactor and bumped up the
  876 ufUseThreshold to compensate.
  877 
  878 
  879 Note [Function applications]
  880 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  881 In a function application (f a b)
  882 
  883   - If 'f' is an argument to the function being analysed,
  884     and there's at least one value arg, record a FunAppDiscount for f
  885 
  886   - If the application if a PAP (arity > 2 in this example)
  887     record a *result* discount (because inlining
  888     with "extra" args in the call may mean that we now
  889     get a saturated application)
  890 
  891 Code for manipulating sizes
  892 -}
  893 
  894 -- | The size of a candidate expression for unfolding
  895 data ExprSize
  896     = TooBig
  897     | SizeIs { _es_size_is  :: {-# UNPACK #-} !Int -- ^ Size found
  898              , _es_args     :: !(Bag (Id,Int))
  899                -- ^ Arguments cased herein, and discount for each such
  900              , _es_discount :: {-# UNPACK #-} !Int
  901                -- ^ Size to subtract if result is scrutinised by a case
  902                -- expression
  903              }
  904 
  905 instance Outputable ExprSize where
  906   ppr TooBig         = text "TooBig"
  907   ppr (SizeIs a _ c) = brackets (int a <+> int c)
  908 
  909 -- subtract the discount before deciding whether to bale out. eg. we
  910 -- want to inline a large constructor application into a selector:
  911 --      tup = (a_1, ..., a_99)
  912 --      x = case tup of ...
  913 --
  914 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
  915 mkSizeIs max n xs d | (n - d) > max = TooBig
  916                     | otherwise     = SizeIs n xs d
  917 
  918 maxSize :: ExprSize -> ExprSize -> ExprSize
  919 maxSize TooBig         _                                  = TooBig
  920 maxSize _              TooBig                             = TooBig
  921 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2   = s1
  922                                               | otherwise = s2
  923 
  924 sizeZero :: ExprSize
  925 sizeN :: Int -> ExprSize
  926 
  927 sizeZero = SizeIs 0 emptyBag 0
  928 sizeN n  = SizeIs n emptyBag 0
  929 
  930 {-
  931 ************************************************************************
  932 *                                                                      *
  933 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
  934 *                                                                      *
  935 ************************************************************************
  936 
  937 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
  938 we ``couldn't possibly use'' on the other side.  Can be overridden w/
  939 flaggery.  Just the same as smallEnoughToInline, except that it has no
  940 actual arguments.
  941 -}
  942 
  943 couldBeSmallEnoughToInline :: UnfoldingOpts -> Int -> CoreExpr -> Bool
  944 couldBeSmallEnoughToInline opts threshold rhs
  945   = case sizeExpr opts threshold [] body of
  946        TooBig -> False
  947        _      -> True
  948   where
  949     (_, body) = collectBinders rhs
  950 
  951 ----------------
  952 smallEnoughToInline :: UnfoldingOpts -> Unfolding -> Bool
  953 smallEnoughToInline opts (CoreUnfolding {uf_guidance = guidance})
  954   = case guidance of
  955        UnfIfGoodArgs {ug_size = size} -> size <= unfoldingUseThreshold opts
  956        UnfWhen {} -> True
  957        UnfNever   -> False
  958 smallEnoughToInline _ _
  959   = False
  960 
  961 {-
  962 ************************************************************************
  963 *                                                                      *
  964 \subsection{callSiteInline}
  965 *                                                                      *
  966 ************************************************************************
  967 
  968 This is the key function.  It decides whether to inline a variable at a call site
  969 
  970 callSiteInline is used at call sites, so it is a bit more generous.
  971 It's a very important function that embodies lots of heuristics.
  972 A non-WHNF can be inlined if it doesn't occur inside a lambda,
  973 and occurs exactly once or
  974     occurs once in each branch of a case and is small
  975 
  976 If the thing is in WHNF, there's no danger of duplicating work,
  977 so we can inline if it occurs once, or is small
  978 
  979 NOTE: we don't want to inline top-level functions that always diverge.
  980 It just makes the code bigger.  Tt turns out that the convenient way to prevent
  981 them inlining is to give them a NOINLINE pragma, which we do in
  982 StrictAnal.addStrictnessInfoToTopId
  983 -}
  984 
  985 data ArgSummary = TrivArg       -- Nothing interesting
  986                 | NonTrivArg    -- Arg has structure
  987                 | ValueArg      -- Arg is a con-app or PAP
  988                                 -- ..or con-like. Note [Conlike is interesting]
  989 
  990 instance Outputable ArgSummary where
  991   ppr TrivArg    = text "TrivArg"
  992   ppr NonTrivArg = text "NonTrivArg"
  993   ppr ValueArg   = text "ValueArg"
  994 
  995 nonTriv ::  ArgSummary -> Bool
  996 nonTriv TrivArg = False
  997 nonTriv _       = True
  998 
  999 data CallCtxt
 1000   = BoringCtxt
 1001   | RhsCtxt             -- Rhs of a let-binding; see Note [RHS of lets]
 1002   | DiscArgCtxt         -- Argument of a function with non-zero arg discount
 1003   | RuleArgCtxt         -- We are somewhere in the argument of a function with rules
 1004 
 1005   | ValAppCtxt          -- We're applied to at least one value arg
 1006                         -- This arises when we have ((f x |> co) y)
 1007                         -- Then the (f x) has argument 'x' but in a ValAppCtxt
 1008 
 1009   | CaseCtxt            -- We're the scrutinee of a case
 1010                         -- that decomposes its scrutinee
 1011 
 1012 instance Outputable CallCtxt where
 1013   ppr CaseCtxt    = text "CaseCtxt"
 1014   ppr ValAppCtxt  = text "ValAppCtxt"
 1015   ppr BoringCtxt  = text "BoringCtxt"
 1016   ppr RhsCtxt     = text "RhsCtxt"
 1017   ppr DiscArgCtxt = text "DiscArgCtxt"
 1018   ppr RuleArgCtxt = text "RuleArgCtxt"
 1019 
 1020 callSiteInline :: Logger
 1021                -> UnfoldingOpts
 1022                -> Int                   -- Case depth
 1023                -> Id                    -- The Id
 1024                -> Bool                  -- True <=> unfolding is active
 1025                -> Bool                  -- True if there are no arguments at all (incl type args)
 1026                -> [ArgSummary]          -- One for each value arg; True if it is interesting
 1027                -> CallCtxt              -- True <=> continuation is interesting
 1028                -> Maybe CoreExpr        -- Unfolding, if any
 1029 callSiteInline logger opts !case_depth id active_unfolding lone_variable arg_infos cont_info
 1030   = case idUnfolding id of
 1031       -- idUnfolding checks for loop-breakers, returning NoUnfolding
 1032       -- Things with an INLINE pragma may have an unfolding *and*
 1033       -- be a loop breaker  (maybe the knot is not yet untied)
 1034         CoreUnfolding { uf_tmpl = unf_template
 1035                       , uf_is_work_free = is_wf
 1036                       , uf_guidance = guidance, uf_expandable = is_exp }
 1037           | active_unfolding -> tryUnfolding logger opts case_depth id lone_variable
 1038                                     arg_infos cont_info unf_template
 1039                                     is_wf is_exp guidance
 1040           | otherwise -> traceInline logger opts id "Inactive unfolding:" (ppr id) Nothing
 1041         NoUnfolding      -> Nothing
 1042         BootUnfolding    -> Nothing
 1043         OtherCon {}      -> Nothing
 1044         DFunUnfolding {} -> Nothing     -- Never unfold a DFun
 1045 
 1046 -- | Report the inlining of an identifier's RHS to the user, if requested.
 1047 traceInline :: Logger -> UnfoldingOpts -> Id -> String -> SDoc -> a -> a
 1048 traceInline logger opts inline_id str doc result
 1049   -- We take care to ensure that doc is used in only one branch, ensuring that
 1050   -- the simplifier can push its allocation into the branch. See Note [INLINE
 1051   -- conditional tracing utilities].
 1052   | enable    = logTraceMsg logger str doc result
 1053   | otherwise = result
 1054   where
 1055     enable
 1056       | logHasDumpFlag logger Opt_D_dump_verbose_inlinings
 1057       = True
 1058       | Just prefix <- unfoldingReportPrefix opts
 1059       = prefix `isPrefixOf` occNameString (getOccName inline_id)
 1060       | otherwise
 1061       = False
 1062 {-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
 1063 
 1064 {- Note [Avoid inlining into deeply nested cases]
 1065    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1066 
 1067 Consider a function f like this:
 1068 
 1069   f arg1 arg2 =
 1070     case ...
 1071       ... -> g arg1
 1072       ... -> g arg2
 1073 
 1074 This function is small. So should be safe to inline.
 1075 However sometimes this doesn't quite work out like that.
 1076 Consider this code:
 1077 
 1078 f1 arg1 arg2 ... = ...
 1079     case _foo of
 1080       alt1 -> ... f2 arg1 ...
 1081       alt2 -> ... f2 arg2 ...
 1082 
 1083 f2 arg1 arg2 ... = ...
 1084     case _foo of
 1085       alt1 -> ... f3 arg1 ...
 1086       alt2 -> ... f3 arg2 ...
 1087 
 1088 f3 arg1 arg2 ... = ...
 1089 
 1090 ... repeats up to n times. And then f1 is
 1091 applied to some arguments:
 1092 
 1093 foo = ... f1 <interestingArgs> ...
 1094 
 1095 Initially f2..fn are not interesting to inline so we don't.
 1096 However we see that f1 is applied to interesting args.
 1097 So it's an obvious choice to inline those:
 1098 
 1099 foo =
 1100     ...
 1101       case _foo of
 1102         alt1 -> ... f2 <interestingArg> ...
 1103         alt2 -> ... f2 <interestingArg> ...
 1104 
 1105 As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting
 1106 arguments and f2 is small:
 1107 
 1108 foo =
 1109     ...
 1110       case _foo of
 1111         alt1 -> ... case _foo of
 1112             alt1 -> ... f3 <interestingArg> ...
 1113             alt2 -> ... f3 <interestingArg> ...
 1114 
 1115         alt2 -> ... case _foo of
 1116             alt1 -> ... f3 <interestingArg> ...
 1117             alt2 -> ... f3 <interestingArg> ...
 1118 
 1119 The same thing happens for each binding up to f_n, duplicating the amount of inlining
 1120 done in each step. Until at some point we are either done or run out of simplifier
 1121 ticks/RAM. This pattern happened #18730.
 1122 
 1123 To combat this we introduce one more heuristic when weighing inlining decision.
 1124 We keep track of a "case-depth". Which increases each time we look inside a case
 1125 expression with more than one alternative.
 1126 
 1127 We then apply a penalty to inlinings based on the case-depth at which they would
 1128 be inlined. Bounding the number of inlinings in such a scenario.
 1129 
 1130 The heuristic can be tuned in two ways:
 1131 
 1132 * We can ignore the first n levels of case nestings for inlining decisions using
 1133   -funfolding-case-threshold.
 1134 * The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling.
 1135   Scaling can be set with -funfolding-case-scaling.
 1136 
 1137 Some guidance on setting these defaults:
 1138 
 1139 * A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of
 1140   control. We picked 2 for no particular reason.
 1141 * Scaling the penalty by any more than 30 means the reproducer from
 1142   T18730 won't compile even with reasonably small values of n. Instead
 1143   it will run out of runs/ticks. This means to positively affect the reproducer
 1144   a scaling <= 30 is required.
 1145 * A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks.
 1146   (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps)
 1147 * A scaling of >= 25 showed no regressions on nofib. However it showed a number of
 1148   (small) regression for compiler perf benchmarks.
 1149 
 1150 The end result is that we are settling for a scaling of 30, with a threshold of 2.
 1151 This gives us minimal compiler perf regressions. No nofib runtime regressions and
 1152 will still avoid this pattern sometimes. This is a "safe" default, where we err on
 1153 the side of compiler blowup instead of risking runtime regressions.
 1154 
 1155 For cases where the default falls short the flag can be changed to allow more/less inlining as
 1156 needed on a per-module basis.
 1157 
 1158 -}
 1159 
 1160 tryUnfolding :: Logger -> UnfoldingOpts -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
 1161              -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
 1162              -> Maybe CoreExpr
 1163 tryUnfolding logger opts !case_depth id lone_variable
 1164              arg_infos cont_info unf_template
 1165              is_wf is_exp guidance
 1166  = case guidance of
 1167      UnfNever -> traceInline logger opts id str (text "UnfNever") Nothing
 1168 
 1169      UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
 1170         | enough_args && (boring_ok || some_benefit || unfoldingVeryAggressive opts)
 1171                 -- See Note [INLINE for small functions (3)]
 1172         -> traceInline logger opts id str (mk_doc some_benefit empty True) (Just unf_template)
 1173         | otherwise
 1174         -> traceInline logger opts id str (mk_doc some_benefit empty False) Nothing
 1175         where
 1176           some_benefit = calc_some_benefit uf_arity
 1177           enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
 1178 
 1179      UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
 1180         | unfoldingVeryAggressive opts
 1181         -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
 1182         | is_wf && some_benefit && small_enough
 1183         -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
 1184         | otherwise
 1185         -> traceInline logger opts id str (mk_doc some_benefit extra_doc False) Nothing
 1186         where
 1187           some_benefit = calc_some_benefit (length arg_discounts)
 1188           extra_doc = vcat [ text "case depth =" <+> int case_depth
 1189                            , text "depth based penalty =" <+> int depth_penalty
 1190                            , text "discounted size =" <+> int adjusted_size ]
 1191           -- See Note [Avoid inlining into deeply nested cases]
 1192           depth_treshold = unfoldingCaseThreshold opts
 1193           depth_scaling = unfoldingCaseScaling opts
 1194           depth_penalty | case_depth <= depth_treshold = 0
 1195                         | otherwise       = (size * (case_depth - depth_treshold)) `div` depth_scaling
 1196           adjusted_size = size + depth_penalty - discount
 1197           small_enough = adjusted_size <= unfoldingUseThreshold opts
 1198           discount = computeDiscount arg_discounts res_discount arg_infos cont_info
 1199 
 1200   where
 1201     mk_doc some_benefit extra_doc yes_or_no
 1202       = vcat [ text "arg infos" <+> ppr arg_infos
 1203              , text "interesting continuation" <+> ppr cont_info
 1204              , text "some_benefit" <+> ppr some_benefit
 1205              , text "is exp:" <+> ppr is_exp
 1206              , text "is work-free:" <+> ppr is_wf
 1207              , text "guidance" <+> ppr guidance
 1208              , extra_doc
 1209              , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
 1210 
 1211     ctx = log_default_dump_context (logFlags logger)
 1212     str = "Considering inlining: " ++ renderWithContext ctx (ppr id)
 1213     n_val_args = length arg_infos
 1214 
 1215            -- some_benefit is used when the RHS is small enough
 1216            -- and the call has enough (or too many) value
 1217            -- arguments (ie n_val_args >= arity). But there must
 1218            -- be *something* interesting about some argument, or the
 1219            -- result context, to make it worth inlining
 1220     calc_some_benefit :: Arity -> Bool   -- The Arity is the number of args
 1221                                          -- expected by the unfolding
 1222     calc_some_benefit uf_arity
 1223        | not saturated = interesting_args       -- Under-saturated
 1224                                         -- Note [Unsaturated applications]
 1225        | otherwise = interesting_args   -- Saturated or over-saturated
 1226                   || interesting_call
 1227       where
 1228         saturated      = n_val_args >= uf_arity
 1229         over_saturated = n_val_args > uf_arity
 1230         interesting_args = any nonTriv arg_infos
 1231                 -- NB: (any nonTriv arg_infos) looks at the
 1232                 -- over-saturated args too which is "wrong";
 1233                 -- but if over-saturated we inline anyway.
 1234 
 1235         interesting_call
 1236           | over_saturated
 1237           = True
 1238           | otherwise
 1239           = case cont_info of
 1240               CaseCtxt   -> not (lone_variable && is_exp)  -- Note [Lone variables]
 1241               ValAppCtxt -> True                           -- Note [Cast then apply]
 1242               RuleArgCtxt -> uf_arity > 0  -- See Note [Unfold info lazy contexts]
 1243               DiscArgCtxt -> uf_arity > 0  -- Note [Inlining in ArgCtxt]
 1244               RhsCtxt     -> uf_arity > 0  --
 1245               _other      -> False         -- See Note [Nested functions]
 1246 
 1247 
 1248 {-
 1249 Note [Unfold into lazy contexts], Note [RHS of lets]
 1250 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1251 When the call is the argument of a function with a RULE, or the RHS of a let,
 1252 we are a little bit keener to inline.  For example
 1253      f y = (y,y,y)
 1254      g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
 1255 We'd inline 'f' if the call was in a case context, and it kind-of-is,
 1256 only we can't see it.  Also
 1257      x = f v
 1258 could be expensive whereas
 1259      x = case v of (a,b) -> a
 1260 is patently cheap and may allow more eta expansion.
 1261 So we treat the RHS of a let as not-totally-boring.
 1262 
 1263 Note [Unsaturated applications]
 1264 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1265 When a call is not saturated, we *still* inline if one of the
 1266 arguments has interesting structure.  That's sometimes very important.
 1267 A good example is the Ord instance for Bool in Base:
 1268 
 1269  Rec {
 1270     $fOrdBool =GHC.Classes.D:Ord
 1271                  @ Bool
 1272                  ...
 1273                  $cmin_ajX
 1274 
 1275     $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
 1276     $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
 1277   }
 1278 
 1279 But the defn of GHC.Classes.$dmmin is:
 1280 
 1281   $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
 1282     {- Arity: 3, HasNoCafRefs, Strictness: SLL,
 1283        Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
 1284                    case @ a GHC.Classes.<= @ a $dOrd x y of wild {
 1285                      GHC.Types.False -> y GHC.Types.True -> x }) -}
 1286 
 1287 We *really* want to inline $dmmin, even though it has arity 3, in
 1288 order to unravel the recursion.
 1289 
 1290 
 1291 Note [Things to watch]
 1292 ~~~~~~~~~~~~~~~~~~~~~~
 1293 *   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
 1294     Assume x is exported, so not inlined unconditionally.
 1295     Then we want x to inline unconditionally; no reason for it
 1296     not to, and doing so avoids an indirection.
 1297 
 1298 *   { x = I# 3; ....f x.... }
 1299     Make sure that x does not inline unconditionally!
 1300     Lest we get extra allocation.
 1301 
 1302 Note [Inlining an InlineRule]
 1303 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1304 An InlineRules is used for
 1305   (a) programmer INLINE pragmas
 1306   (b) inlinings from worker/wrapper
 1307 
 1308 For (a) the RHS may be large, and our contract is that we *only* inline
 1309 when the function is applied to all the arguments on the LHS of the
 1310 source-code defn.  (The uf_arity in the rule.)
 1311 
 1312 However for worker/wrapper it may be worth inlining even if the
 1313 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
 1314 require saturation.
 1315 
 1316 Note [Nested functions]
 1317 ~~~~~~~~~~~~~~~~~~~~~~~
 1318 At one time we treated a call of a non-top-level function as
 1319 "interesting" (regardless of how boring the context) in the hope
 1320 that inlining it would eliminate the binding, and its allocation.
 1321 Specifically, in the default case of interesting_call we had
 1322    _other -> not is_top && uf_arity > 0
 1323 
 1324 But actually postInlineUnconditionally does some of this and overall
 1325 it makes virtually no difference to nofib.  So I simplified away this
 1326 special case
 1327 
 1328 Note [Cast then apply]
 1329 ~~~~~~~~~~~~~~~~~~~~~~
 1330 Consider
 1331    myIndex = __inline_me ( (/\a. <blah>) |> co )
 1332    co :: (forall a. a -> a) ~ (forall a. T a)
 1333      ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
 1334 
 1335 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
 1336 no value arguments.  The ValAppCtxt gives it enough incentive to inline.
 1337 
 1338 Note [Inlining in ArgCtxt]
 1339 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 1340 The condition (arity > 0) here is very important, because otherwise
 1341 we end up inlining top-level stuff into useless places; eg
 1342    x = I# 3#
 1343    f = \y.  g x
 1344 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
 1345 and 20% to 'power'.
 1346 
 1347 At one stage I replaced this condition by 'True' (leading to the above
 1348 slow-down).  The motivation was test eyeball/inline1.hs; but that seems
 1349 to work ok now.
 1350 
 1351 NOTE: arguably, we should inline in ArgCtxt only if the result of the
 1352 call is at least CONLIKE.  At least for the cases where we use ArgCtxt
 1353 for the RHS of a 'let', we only profit from the inlining if we get a
 1354 CONLIKE thing (modulo lets).
 1355 
 1356 Note [Lone variables]   See also Note [Interaction of exprIsWorkFree and lone variables]
 1357 ~~~~~~~~~~~~~~~~~~~~~   which appears below
 1358 The "lone-variable" case is important.  I spent ages messing about
 1359 with unsatisfactory variants, but this is nice.  The idea is that if a
 1360 variable appears all alone
 1361 
 1362         as an arg of lazy fn, or rhs    BoringCtxt
 1363         as scrutinee of a case          CaseCtxt
 1364         as arg of a fn                  ArgCtxt
 1365 AND
 1366         it is bound to a cheap expression
 1367 
 1368 then we should not inline it (unless there is some other reason,
 1369 e.g. it is the sole occurrence).  That is what is happening at
 1370 the use of 'lone_variable' in 'interesting_call'.
 1371 
 1372 Why?  At least in the case-scrutinee situation, turning
 1373         let x = (a,b) in case x of y -> ...
 1374 into
 1375         let x = (a,b) in case (a,b) of y -> ...
 1376 and thence to
 1377         let x = (a,b) in let y = (a,b) in ...
 1378 is bad if the binding for x will remain.
 1379 
 1380 Another example: I discovered that strings
 1381 were getting inlined straight back into applications of 'error'
 1382 because the latter is strict.
 1383         s = "foo"
 1384         f = \x -> ...(error s)...
 1385 
 1386 Fundamentally such contexts should not encourage inlining because, provided
 1387 the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the
 1388 context can ``see'' the unfolding of the variable (e.g. case or a
 1389 RULE) so there's no gain.
 1390 
 1391 However, watch out:
 1392 
 1393  * Consider this:
 1394         foo = _inline_ (\n. [n])
 1395         bar = _inline_ (foo 20)
 1396         baz = \n. case bar of { (m:_) -> m + n }
 1397    Here we really want to inline 'bar' so that we can inline 'foo'
 1398    and the whole thing unravels as it should obviously do.  This is
 1399    important: in the NDP project, 'bar' generates a closure data
 1400    structure rather than a list.
 1401 
 1402    So the non-inlining of lone_variables should only apply if the
 1403    unfolding is regarded as cheap; because that is when exprIsConApp_maybe
 1404    looks through the unfolding.  Hence the "&& is_wf" in the
 1405    InlineRule branch.
 1406 
 1407  * Even a type application or coercion isn't a lone variable.
 1408    Consider
 1409         case $fMonadST @ RealWorld of { :DMonad a b c -> c }
 1410    We had better inline that sucker!  The case won't see through it.
 1411 
 1412    For now, I'm treating treating a variable applied to types
 1413    in a *lazy* context "lone". The motivating example was
 1414         f = /\a. \x. BIG
 1415         g = /\a. \y.  h (f a)
 1416    There's no advantage in inlining f here, and perhaps
 1417    a significant disadvantage.  Hence some_val_args in the Stop case
 1418 
 1419 Note [Interaction of exprIsWorkFree and lone variables]
 1420 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1421 The lone-variable test says "don't inline if a case expression
 1422 scrutinises a lone variable whose unfolding is cheap".  It's very
 1423 important that, under these circumstances, exprIsConApp_maybe
 1424 can spot a constructor application. So, for example, we don't
 1425 consider
 1426         let x = e in (x,x)
 1427 to be cheap, and that's good because exprIsConApp_maybe doesn't
 1428 think that expression is a constructor application.
 1429 
 1430 In the 'not (lone_variable && is_wf)' test, I used to test is_value
 1431 rather than is_wf, which was utterly wrong, because the above
 1432 expression responds True to exprIsHNF, which is what sets is_value.
 1433 
 1434 This kind of thing can occur if you have
 1435 
 1436         {-# INLINE foo #-}
 1437         foo = let x = e in (x,x)
 1438 
 1439 which Roman did.
 1440 
 1441 
 1442 -}
 1443 
 1444 computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt
 1445                 -> Int
 1446 computeDiscount arg_discounts res_discount arg_infos cont_info
 1447 
 1448   = 10          -- Discount of 10 because the result replaces the call
 1449                 -- so we count 10 for the function itself
 1450 
 1451     + 10 * length actual_arg_discounts
 1452                -- Discount of 10 for each arg supplied,
 1453                -- because the result replaces the call
 1454 
 1455     + total_arg_discount + res_discount'
 1456   where
 1457     actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
 1458     total_arg_discount   = sum actual_arg_discounts
 1459 
 1460     mk_arg_discount _        TrivArg    = 0
 1461     mk_arg_discount _        NonTrivArg = 10
 1462     mk_arg_discount discount ValueArg   = discount
 1463 
 1464     res_discount'
 1465       | LT <- arg_discounts `compareLength` arg_infos
 1466       = res_discount   -- Over-saturated
 1467       | otherwise
 1468       = case cont_info of
 1469            BoringCtxt  -> 0
 1470            CaseCtxt    -> res_discount  -- Presumably a constructor
 1471            ValAppCtxt  -> res_discount  -- Presumably a function
 1472            _           -> 40 `min` res_discount
 1473                 -- ToDo: this 40 `min` res_discount doesn't seem right
 1474                 --   for DiscArgCtxt it shouldn't matter because the function will
 1475                 --       get the arg discount for any non-triv arg
 1476                 --   for RuleArgCtxt we do want to be keener to inline; but not only
 1477                 --       constructor results
 1478                 --   for RhsCtxt I suppose that exposing a data con is good in general
 1479                 --   And 40 seems very arbitrary
 1480                 --
 1481                 -- res_discount can be very large when a function returns
 1482                 -- constructors; but we only want to invoke that large discount
 1483                 -- when there's a case continuation.
 1484                 -- Otherwise we, rather arbitrarily, threshold it.  Yuk.
 1485                 -- But we want to avoid inlining large functions that return
 1486                 -- constructors into contexts that are simply "interesting"