never executed always true always false
1 {-# LANGUAGE BangPatterns #-}
2
3 module GHC.Cmm.Pipeline (
4 -- | Converts C-- with an implicit stack and native C-- calls into
5 -- optimized, CPS converted and native-call-less C--. The latter
6 -- C-- can be used to generate assembly.
7 cmmPipeline
8 ) where
9
10 import GHC.Prelude
11
12 import GHC.Cmm
13 import GHC.Cmm.Lint
14 import GHC.Cmm.Info.Build
15 import GHC.Cmm.CommonBlockElim
16 import GHC.Cmm.Switch.Implement
17 import GHC.Cmm.ProcPoint
18 import GHC.Cmm.ContFlowOpt
19 import GHC.Cmm.LayoutStack
20 import GHC.Cmm.Sink
21 import GHC.Cmm.Dataflow.Collections
22
23 import GHC.Types.Unique.Supply
24 import GHC.Driver.Session
25 import GHC.Driver.Backend
26 import GHC.Utils.Error
27 import GHC.Utils.Logger
28 import GHC.Driver.Env
29 import Control.Monad
30 import GHC.Utils.Outputable
31 import GHC.Platform
32 import Data.Either (partitionEithers)
33
34 -----------------------------------------------------------------------------
35 -- | Top level driver for C-- pipeline
36 -----------------------------------------------------------------------------
37
38 cmmPipeline
39 :: HscEnv -- Compilation env including
40 -- dynamic flags: -dcmm-lint -ddump-cmm-cps
41 -> ModuleSRTInfo -- Info about SRTs generated so far
42 -> CmmGroup -- Input C-- with Procedures
43 -> IO (ModuleSRTInfo, CmmGroupSRTs) -- Output CPS transformed C--
44
45 cmmPipeline hsc_env srtInfo prog = do
46 let logger = hsc_logger hsc_env
47 let dflags = hsc_dflags hsc_env
48 let forceRes (info, group) = info `seq` foldr (\decl r -> decl `seq` r) () group
49 let platform = targetPlatform dflags
50 withTimingSilent logger (text "Cmm pipeline") forceRes $ do
51 tops <- {-# SCC "tops" #-} mapM (cpsTop logger platform dflags) prog
52
53 let (procs, data_) = partitionEithers tops
54 (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo procs data_
55 dumpWith logger Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (pdoc platform cmms)
56
57 return (srtInfo, cmms)
58
59
60 cpsTop :: Logger -> Platform -> DynFlags -> CmmDecl -> IO (Either (CAFEnv, [CmmDecl]) (CAFSet, CmmDecl))
61 cpsTop _logger platform _ p@(CmmData _ statics) = return (Right (cafAnalData platform statics, p))
62 cpsTop logger platform dflags proc =
63 do
64 ----------- Control-flow optimisations ----------------------------------
65
66 -- The first round of control-flow optimisation speeds up the
67 -- later passes by removing lots of empty blocks, so we do it
68 -- even when optimisation isn't turned on.
69 --
70 CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
71 return $ cmmCfgOptsProc splitting_proc_points proc
72 dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
73
74 let !TopInfo {stack_info=StackInfo { arg_space = entry_off
75 , do_layout = do_layout }} = h
76
77 ----------- Eliminate common blocks -------------------------------------
78 g <- {-# SCC "elimCommonBlocks" #-}
79 condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
80 Opt_D_dump_cmm_cbe "Post common block elimination"
81
82 -- Any work storing block Labels must be performed _after_
83 -- elimCommonBlocks
84
85 ----------- Implement switches ------------------------------------------
86 g <- {-# SCC "createSwitchPlans" #-}
87 runUniqSM $ cmmImplementSwitchPlans (backend dflags) platform g
88 dump Opt_D_dump_cmm_switch "Post switch plan" g
89
90 ----------- Proc points -------------------------------------------------
91 let
92 call_pps :: ProcPointSet -- LabelMap
93 call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
94 proc_points <-
95 if splitting_proc_points
96 then do
97 pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
98 minimalProcPointSet platform call_pps g
99 dumpWith logger Opt_D_dump_cmm_proc "Proc points"
100 FormatCMM (pdoc platform l $$ ppr pp $$ pdoc platform g)
101 return pp
102 else
103 return call_pps
104
105 ----------- Layout the stack and manifest Sp ----------------------------
106 (g, stackmaps) <-
107 {-# SCC "layoutStack" #-}
108 if do_layout
109 then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
110 else return (g, mapEmpty)
111 dump Opt_D_dump_cmm_sp "Layout Stack" g
112
113 ----------- Sink and inline assignments --------------------------------
114 g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
115 condPass Opt_CmmSink (cmmSink platform) g
116 Opt_D_dump_cmm_sink "Sink assignments"
117
118 ------------- CAF analysis ----------------------------------------------
119 let cafEnv = {-# SCC "cafAnal" #-} cafAnal platform call_pps l g
120 dumpWith logger Opt_D_dump_cmm_caf "CAFEnv" FormatText (pdoc platform cafEnv)
121
122 g <- if splitting_proc_points
123 then do
124 ------------- Split into separate procedures -----------------------
125 let pp_map = {-# SCC "procPointAnalysis" #-}
126 procPointAnalysis proc_points g
127 dumpWith logger Opt_D_dump_cmm_procmap "procpoint map"
128 FormatCMM (ppr pp_map)
129 g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
130 splitAtProcPoints platform l call_pps proc_points pp_map
131 (CmmProc h l v g)
132 dumps Opt_D_dump_cmm_split "Post splitting" g
133 return g
134 else
135 -- attach info tables to return points
136 return $ [attachContInfoTables call_pps (CmmProc h l v g)]
137
138 ------------- Populate info tables with stack info -----------------
139 g <- {-# SCC "setInfoTableStackMap" #-}
140 return $ map (setInfoTableStackMap platform stackmaps) g
141 dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
142
143 ----------- Control-flow optimisations -----------------------------
144 g <- {-# SCC "cmmCfgOpts(2)" #-}
145 return $ if optLevel dflags >= 1
146 then map (cmmCfgOptsProc splitting_proc_points) g
147 else g
148 g <- return (map removeUnreachableBlocksProc g)
149 -- See Note [unreachable blocks]
150 dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
151
152 return (Left (cafEnv, g))
153
154 where dump = dumpGraph logger platform dflags
155
156 dumps flag name
157 = mapM_ (dumpWith logger flag name FormatCMM . pdoc platform)
158
159 condPass flag pass g dumpflag dumpname =
160 if gopt flag dflags
161 then do
162 g <- return $ pass g
163 dump dumpflag dumpname g
164 return g
165 else return g
166
167 -- we don't need to split proc points for the NCG, unless
168 -- tablesNextToCode is off. The latter is because we have no
169 -- label to put on info tables for basic blocks that are not
170 -- the entry point.
171 splitting_proc_points = backend dflags /= NCG
172 || not (platformTablesNextToCode platform)
173 || -- Note [inconsistent-pic-reg]
174 usingInconsistentPicReg
175 usingInconsistentPicReg
176 = case (platformArch platform, platformOS platform, positionIndependent dflags)
177 of (ArchX86, OSDarwin, pic) -> pic
178 _ -> False
179
180 -- Note [Sinking after stack layout]
181 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
182 --
183 -- In the past we considered running sinking pass also before stack
184 -- layout, but after making some measurements we realized that:
185 --
186 -- a) running sinking only before stack layout produces slower
187 -- code than running sinking only before stack layout
188 --
189 -- b) running sinking both before and after stack layout produces
190 -- code that has the same performance as when running sinking
191 -- only after stack layout.
192 --
193 -- In other words sinking before stack layout doesn't buy as anything.
194 --
195 -- An interesting question is "why is it better to run sinking after
196 -- stack layout"? It seems that the major reason are stores and loads
197 -- generated by stack layout. Consider this code before stack layout:
198 --
199 -- c1E:
200 -- _c1C::P64 = R3;
201 -- _c1B::P64 = R2;
202 -- _c1A::P64 = R1;
203 -- I64[(young<c1D> + 8)] = c1D;
204 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
205 -- c1D:
206 -- R3 = _c1C::P64;
207 -- R2 = _c1B::P64;
208 -- R1 = _c1A::P64;
209 -- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
210 --
211 -- Stack layout pass will save all local variables live across a call
212 -- (_c1C, _c1B and _c1A in this example) on the stack just before
213 -- making a call and reload them from the stack after returning from a
214 -- call:
215 --
216 -- c1E:
217 -- _c1C::P64 = R3;
218 -- _c1B::P64 = R2;
219 -- _c1A::P64 = R1;
220 -- I64[Sp - 32] = c1D;
221 -- P64[Sp - 24] = _c1A::P64;
222 -- P64[Sp - 16] = _c1B::P64;
223 -- P64[Sp - 8] = _c1C::P64;
224 -- Sp = Sp - 32;
225 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
226 -- c1D:
227 -- _c1A::P64 = P64[Sp + 8];
228 -- _c1B::P64 = P64[Sp + 16];
229 -- _c1C::P64 = P64[Sp + 24];
230 -- R3 = _c1C::P64;
231 -- R2 = _c1B::P64;
232 -- R1 = _c1A::P64;
233 -- Sp = Sp + 32;
234 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
235 --
236 -- If we don't run sinking pass after stack layout we are basically
237 -- left with such code. However, running sinking on this code can lead
238 -- to significant improvements:
239 --
240 -- c1E:
241 -- I64[Sp - 32] = c1D;
242 -- P64[Sp - 24] = R1;
243 -- P64[Sp - 16] = R2;
244 -- P64[Sp - 8] = R3;
245 -- Sp = Sp - 32;
246 -- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
247 -- c1D:
248 -- R3 = P64[Sp + 24];
249 -- R2 = P64[Sp + 16];
250 -- R1 = P64[Sp + 8];
251 -- Sp = Sp + 32;
252 -- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
253 --
254 -- Now we only have 9 assignments instead of 15.
255 --
256 -- There is one case when running sinking before stack layout could
257 -- be beneficial. Consider this:
258 --
259 -- L1:
260 -- x = y
261 -- call f() returns L2
262 -- L2: ...x...y...
263 --
264 -- Since both x and y are live across a call to f, they will be stored
265 -- on the stack during stack layout and restored after the call:
266 --
267 -- L1:
268 -- x = y
269 -- P64[Sp - 24] = L2
270 -- P64[Sp - 16] = x
271 -- P64[Sp - 8] = y
272 -- Sp = Sp - 24
273 -- call f() returns L2
274 -- L2:
275 -- y = P64[Sp + 16]
276 -- x = P64[Sp + 8]
277 -- Sp = Sp + 24
278 -- ...x...y...
279 --
280 -- However, if we run sinking before stack layout we would propagate x
281 -- to its usage place (both x and y must be local register for this to
282 -- be possible - global registers cannot be floated past a call):
283 --
284 -- L1:
285 -- x = y
286 -- call f() returns L2
287 -- L2: ...y...y...
288 --
289 -- Thus making x dead at the call to f(). If we ran stack layout now
290 -- we would generate less stores and loads:
291 --
292 -- L1:
293 -- x = y
294 -- P64[Sp - 16] = L2
295 -- P64[Sp - 8] = y
296 -- Sp = Sp - 16
297 -- call f() returns L2
298 -- L2:
299 -- y = P64[Sp + 8]
300 -- Sp = Sp + 16
301 -- ...y...y...
302 --
303 -- But since we don't see any benefits from running sinking before stack
304 -- layout, this situation probably doesn't arise too often in practice.
305 --
306
307 {- Note [inconsistent-pic-reg]
308
309 On x86/Darwin, PIC is implemented by inserting a sequence like
310
311 call 1f
312 1: popl %reg
313
314 at the proc entry point, and then referring to labels as offsets from
315 %reg. If we don't split proc points, then we could have many entry
316 points in a proc that would need this sequence, and each entry point
317 would then get a different value for %reg. If there are any join
318 points, then at the join point we don't have a consistent value for
319 %reg, so we don't know how to refer to labels.
320
321 Hence, on x86/Darwin, we have to split proc points, and then each proc
322 point will get its own PIC initialisation sequence.
323
324 This isn't an issue on x86/ELF, where the sequence is
325
326 call 1f
327 1: popl %reg
328 addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
329
330 so %reg always has a consistent value: the address of
331 _GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
332
333 -}
334
335 {- Note [unreachable blocks]
336
337 The control-flow optimiser sometimes leaves unreachable blocks behind
338 containing junk code. These aren't necessarily a problem, but
339 removing them is good because it might save time in the native code
340 generator later.
341
342 -}
343
344 runUniqSM :: UniqSM a -> IO a
345 runUniqSM m = do
346 us <- mkSplitUniqSupply 'u'
347 return (initUs_ us m)
348
349
350 dumpGraph :: Logger -> Platform -> DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
351 dumpGraph logger platform dflags flag name g = do
352 when (gopt Opt_DoCmmLinting dflags) $ do_lint g
353 dumpWith logger flag name FormatCMM (pdoc platform g)
354 where
355 do_lint g = case cmmLintGraph platform g of
356 Just err -> do { fatalErrorMsg logger err
357 ; ghcExit logger 1
358 }
359 Nothing -> return ()
360
361 dumpWith :: Logger -> DumpFlag -> String -> DumpFormat -> SDoc -> IO ()
362 dumpWith logger flag txt fmt sdoc = do
363 putDumpFileMaybe logger flag txt fmt sdoc
364 when (not (logHasDumpFlag logger flag)) $
365 -- If `-ddump-cmm-verbose -ddump-to-file` is specified,
366 -- dump each Cmm pipeline stage output to a separate file. #16930
367 when (logHasDumpFlag logger Opt_D_dump_cmm_verbose)
368 $ logDumpFile logger (mkDumpStyle alwaysQualify) flag txt fmt sdoc
369 putDumpFileMaybe logger Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc