never executed always true always false
    1 {-# LANGUAGE BangPatterns #-}
    2 
    3 module GHC.Cmm.Pipeline (
    4   -- | Converts C-- with an implicit stack and native C-- calls into
    5   -- optimized, CPS converted and native-call-less C--.  The latter
    6   -- C-- can be used to generate assembly.
    7   cmmPipeline
    8 ) where
    9 
   10 import GHC.Prelude
   11 
   12 import GHC.Cmm
   13 import GHC.Cmm.Lint
   14 import GHC.Cmm.Info.Build
   15 import GHC.Cmm.CommonBlockElim
   16 import GHC.Cmm.Switch.Implement
   17 import GHC.Cmm.ProcPoint
   18 import GHC.Cmm.ContFlowOpt
   19 import GHC.Cmm.LayoutStack
   20 import GHC.Cmm.Sink
   21 import GHC.Cmm.Dataflow.Collections
   22 
   23 import GHC.Types.Unique.Supply
   24 import GHC.Driver.Session
   25 import GHC.Driver.Backend
   26 import GHC.Utils.Error
   27 import GHC.Utils.Logger
   28 import GHC.Driver.Env
   29 import Control.Monad
   30 import GHC.Utils.Outputable
   31 import GHC.Platform
   32 import Data.Either (partitionEithers)
   33 
   34 -----------------------------------------------------------------------------
   35 -- | Top level driver for C-- pipeline
   36 -----------------------------------------------------------------------------
   37 
   38 cmmPipeline
   39  :: HscEnv -- Compilation env including
   40            -- dynamic flags: -dcmm-lint -ddump-cmm-cps
   41  -> ModuleSRTInfo        -- Info about SRTs generated so far
   42  -> CmmGroup             -- Input C-- with Procedures
   43  -> IO (ModuleSRTInfo, CmmGroupSRTs) -- Output CPS transformed C--
   44 
   45 cmmPipeline hsc_env srtInfo prog = do
   46   let logger = hsc_logger hsc_env
   47   let dflags = hsc_dflags hsc_env
   48   let forceRes (info, group) = info `seq` foldr (\decl r -> decl `seq` r) () group
   49   let platform = targetPlatform dflags
   50   withTimingSilent logger (text "Cmm pipeline") forceRes $ do
   51      tops <- {-# SCC "tops" #-} mapM (cpsTop logger platform dflags) prog
   52 
   53      let (procs, data_) = partitionEithers tops
   54      (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo procs data_
   55      dumpWith logger Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (pdoc platform cmms)
   56 
   57      return (srtInfo, cmms)
   58 
   59 
   60 cpsTop :: Logger -> Platform -> DynFlags -> CmmDecl -> IO (Either (CAFEnv, [CmmDecl]) (CAFSet, CmmDecl))
   61 cpsTop _logger platform _ p@(CmmData _ statics) = return (Right (cafAnalData platform statics, p))
   62 cpsTop logger platform dflags proc =
   63     do
   64       ----------- Control-flow optimisations ----------------------------------
   65 
   66       -- The first round of control-flow optimisation speeds up the
   67       -- later passes by removing lots of empty blocks, so we do it
   68       -- even when optimisation isn't turned on.
   69       --
   70       CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
   71            return $ cmmCfgOptsProc splitting_proc_points proc
   72       dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
   73 
   74       let !TopInfo {stack_info=StackInfo { arg_space = entry_off
   75                                          , do_layout = do_layout }} = h
   76 
   77       ----------- Eliminate common blocks -------------------------------------
   78       g <- {-# SCC "elimCommonBlocks" #-}
   79            condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
   80                          Opt_D_dump_cmm_cbe "Post common block elimination"
   81 
   82       -- Any work storing block Labels must be performed _after_
   83       -- elimCommonBlocks
   84 
   85       ----------- Implement switches ------------------------------------------
   86       g <- {-# SCC "createSwitchPlans" #-}
   87            runUniqSM $ cmmImplementSwitchPlans (backend dflags) platform g
   88       dump Opt_D_dump_cmm_switch "Post switch plan" g
   89 
   90       ----------- Proc points -------------------------------------------------
   91       let
   92         call_pps :: ProcPointSet -- LabelMap
   93         call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
   94       proc_points <-
   95          if splitting_proc_points
   96             then do
   97               pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
   98                  minimalProcPointSet platform call_pps g
   99               dumpWith logger Opt_D_dump_cmm_proc "Proc points"
  100                     FormatCMM (pdoc platform l $$ ppr pp $$ pdoc platform g)
  101               return pp
  102             else
  103               return call_pps
  104 
  105       ----------- Layout the stack and manifest Sp ----------------------------
  106       (g, stackmaps) <-
  107            {-# SCC "layoutStack" #-}
  108            if do_layout
  109               then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
  110               else return (g, mapEmpty)
  111       dump Opt_D_dump_cmm_sp "Layout Stack" g
  112 
  113       ----------- Sink and inline assignments  --------------------------------
  114       g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
  115            condPass Opt_CmmSink (cmmSink platform) g
  116                     Opt_D_dump_cmm_sink "Sink assignments"
  117 
  118       ------------- CAF analysis ----------------------------------------------
  119       let cafEnv = {-# SCC "cafAnal" #-} cafAnal platform call_pps l g
  120       dumpWith logger Opt_D_dump_cmm_caf "CAFEnv" FormatText (pdoc platform cafEnv)
  121 
  122       g <- if splitting_proc_points
  123            then do
  124              ------------- Split into separate procedures -----------------------
  125              let pp_map = {-# SCC "procPointAnalysis" #-}
  126                           procPointAnalysis proc_points g
  127              dumpWith logger Opt_D_dump_cmm_procmap "procpoint map"
  128                 FormatCMM (ppr pp_map)
  129              g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
  130                   splitAtProcPoints platform l call_pps proc_points pp_map
  131                                     (CmmProc h l v g)
  132              dumps Opt_D_dump_cmm_split "Post splitting" g
  133              return g
  134            else
  135              -- attach info tables to return points
  136              return $ [attachContInfoTables call_pps (CmmProc h l v g)]
  137 
  138       ------------- Populate info tables with stack info -----------------
  139       g <- {-# SCC "setInfoTableStackMap" #-}
  140            return $ map (setInfoTableStackMap platform stackmaps) g
  141       dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
  142 
  143       ----------- Control-flow optimisations -----------------------------
  144       g <- {-# SCC "cmmCfgOpts(2)" #-}
  145            return $ if optLevel dflags >= 1
  146                     then map (cmmCfgOptsProc splitting_proc_points) g
  147                     else g
  148       g <- return (map removeUnreachableBlocksProc g)
  149            -- See Note [unreachable blocks]
  150       dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
  151 
  152       return (Left (cafEnv, g))
  153 
  154   where dump = dumpGraph logger platform dflags
  155 
  156         dumps flag name
  157            = mapM_ (dumpWith logger flag name FormatCMM . pdoc platform)
  158 
  159         condPass flag pass g dumpflag dumpname =
  160             if gopt flag dflags
  161                then do
  162                     g <- return $ pass g
  163                     dump dumpflag dumpname g
  164                     return g
  165                else return g
  166 
  167         -- we don't need to split proc points for the NCG, unless
  168         -- tablesNextToCode is off.  The latter is because we have no
  169         -- label to put on info tables for basic blocks that are not
  170         -- the entry point.
  171         splitting_proc_points = backend dflags /= NCG
  172                              || not (platformTablesNextToCode platform)
  173                              || -- Note [inconsistent-pic-reg]
  174                                 usingInconsistentPicReg
  175         usingInconsistentPicReg
  176            = case (platformArch platform, platformOS platform, positionIndependent dflags)
  177              of   (ArchX86, OSDarwin, pic) -> pic
  178                   _                        -> False
  179 
  180 -- Note [Sinking after stack layout]
  181 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  182 --
  183 -- In the past we considered running sinking pass also before stack
  184 -- layout, but after making some measurements we realized that:
  185 --
  186 --   a) running sinking only before stack layout produces slower
  187 --      code than running sinking only before stack layout
  188 --
  189 --   b) running sinking both before and after stack layout produces
  190 --      code that has the same performance as when running sinking
  191 --      only after stack layout.
  192 --
  193 -- In other words sinking before stack layout doesn't buy as anything.
  194 --
  195 -- An interesting question is "why is it better to run sinking after
  196 -- stack layout"? It seems that the major reason are stores and loads
  197 -- generated by stack layout. Consider this code before stack layout:
  198 --
  199 --  c1E:
  200 --      _c1C::P64 = R3;
  201 --      _c1B::P64 = R2;
  202 --      _c1A::P64 = R1;
  203 --      I64[(young<c1D> + 8)] = c1D;
  204 --      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
  205 --  c1D:
  206 --      R3 = _c1C::P64;
  207 --      R2 = _c1B::P64;
  208 --      R1 = _c1A::P64;
  209 --      call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
  210 --
  211 -- Stack layout pass will save all local variables live across a call
  212 -- (_c1C, _c1B and _c1A in this example) on the stack just before
  213 -- making a call and reload them from the stack after returning from a
  214 -- call:
  215 --
  216 --  c1E:
  217 --      _c1C::P64 = R3;
  218 --      _c1B::P64 = R2;
  219 --      _c1A::P64 = R1;
  220 --      I64[Sp - 32] = c1D;
  221 --      P64[Sp - 24] = _c1A::P64;
  222 --      P64[Sp - 16] = _c1B::P64;
  223 --      P64[Sp - 8] = _c1C::P64;
  224 --      Sp = Sp - 32;
  225 --      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
  226 --  c1D:
  227 --      _c1A::P64 = P64[Sp + 8];
  228 --      _c1B::P64 = P64[Sp + 16];
  229 --      _c1C::P64 = P64[Sp + 24];
  230 --      R3 = _c1C::P64;
  231 --      R2 = _c1B::P64;
  232 --      R1 = _c1A::P64;
  233 --      Sp = Sp + 32;
  234 --      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
  235 --
  236 -- If we don't run sinking pass after stack layout we are basically
  237 -- left with such code. However, running sinking on this code can lead
  238 -- to significant improvements:
  239 --
  240 --  c1E:
  241 --      I64[Sp - 32] = c1D;
  242 --      P64[Sp - 24] = R1;
  243 --      P64[Sp - 16] = R2;
  244 --      P64[Sp - 8] = R3;
  245 --      Sp = Sp - 32;
  246 --      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
  247 --  c1D:
  248 --      R3 = P64[Sp + 24];
  249 --      R2 = P64[Sp + 16];
  250 --      R1 = P64[Sp + 8];
  251 --      Sp = Sp + 32;
  252 --      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
  253 --
  254 -- Now we only have 9 assignments instead of 15.
  255 --
  256 -- There is one case when running sinking before stack layout could
  257 -- be beneficial. Consider this:
  258 --
  259 --   L1:
  260 --      x = y
  261 --      call f() returns L2
  262 --   L2: ...x...y...
  263 --
  264 -- Since both x and y are live across a call to f, they will be stored
  265 -- on the stack during stack layout and restored after the call:
  266 --
  267 --   L1:
  268 --      x = y
  269 --      P64[Sp - 24] = L2
  270 --      P64[Sp - 16] = x
  271 --      P64[Sp - 8]  = y
  272 --      Sp = Sp - 24
  273 --      call f() returns L2
  274 --   L2:
  275 --      y = P64[Sp + 16]
  276 --      x = P64[Sp + 8]
  277 --      Sp = Sp + 24
  278 --      ...x...y...
  279 --
  280 -- However, if we run sinking before stack layout we would propagate x
  281 -- to its usage place (both x and y must be local register for this to
  282 -- be possible - global registers cannot be floated past a call):
  283 --
  284 --   L1:
  285 --      x = y
  286 --      call f() returns L2
  287 --   L2: ...y...y...
  288 --
  289 -- Thus making x dead at the call to f(). If we ran stack layout now
  290 -- we would generate less stores and loads:
  291 --
  292 --   L1:
  293 --      x = y
  294 --      P64[Sp - 16] = L2
  295 --      P64[Sp - 8]  = y
  296 --      Sp = Sp - 16
  297 --      call f() returns L2
  298 --   L2:
  299 --      y = P64[Sp + 8]
  300 --      Sp = Sp + 16
  301 --      ...y...y...
  302 --
  303 -- But since we don't see any benefits from running sinking before stack
  304 -- layout, this situation probably doesn't arise too often in practice.
  305 --
  306 
  307 {- Note [inconsistent-pic-reg]
  308 
  309 On x86/Darwin, PIC is implemented by inserting a sequence like
  310 
  311     call 1f
  312  1: popl %reg
  313 
  314 at the proc entry point, and then referring to labels as offsets from
  315 %reg.  If we don't split proc points, then we could have many entry
  316 points in a proc that would need this sequence, and each entry point
  317 would then get a different value for %reg.  If there are any join
  318 points, then at the join point we don't have a consistent value for
  319 %reg, so we don't know how to refer to labels.
  320 
  321 Hence, on x86/Darwin, we have to split proc points, and then each proc
  322 point will get its own PIC initialisation sequence.
  323 
  324 This isn't an issue on x86/ELF, where the sequence is
  325 
  326     call 1f
  327  1: popl %reg
  328     addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
  329 
  330 so %reg always has a consistent value: the address of
  331 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
  332 
  333 -}
  334 
  335 {- Note [unreachable blocks]
  336 
  337 The control-flow optimiser sometimes leaves unreachable blocks behind
  338 containing junk code.  These aren't necessarily a problem, but
  339 removing them is good because it might save time in the native code
  340 generator later.
  341 
  342 -}
  343 
  344 runUniqSM :: UniqSM a -> IO a
  345 runUniqSM m = do
  346   us <- mkSplitUniqSupply 'u'
  347   return (initUs_ us m)
  348 
  349 
  350 dumpGraph :: Logger -> Platform -> DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
  351 dumpGraph logger platform dflags flag name g = do
  352   when (gopt Opt_DoCmmLinting dflags) $ do_lint g
  353   dumpWith logger flag name FormatCMM (pdoc platform g)
  354  where
  355   do_lint g = case cmmLintGraph platform g of
  356                  Just err -> do { fatalErrorMsg logger err
  357                                 ; ghcExit logger 1
  358                                 }
  359                  Nothing  -> return ()
  360 
  361 dumpWith :: Logger -> DumpFlag -> String -> DumpFormat -> SDoc -> IO ()
  362 dumpWith logger flag txt fmt sdoc = do
  363   putDumpFileMaybe logger flag txt fmt sdoc
  364   when (not (logHasDumpFlag logger flag)) $
  365     -- If `-ddump-cmm-verbose -ddump-to-file` is specified,
  366     -- dump each Cmm pipeline stage output to a separate file.  #16930
  367     when (logHasDumpFlag logger Opt_D_dump_cmm_verbose)
  368       $ logDumpFile logger (mkDumpStyle alwaysQualify) flag txt fmt sdoc
  369   putDumpFileMaybe logger Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc