never executed always true always false
1 module GHC.CmmToAsm.Reg.Graph.TrivColorable (
2 trivColorable,
3 )
4
5 where
6
7 import GHC.Prelude
8
9 import GHC.Platform.Reg.Class
10 import GHC.Platform.Reg
11
12 import GHC.Data.Graph.Base
13
14 import GHC.Types.Unique.Set
15 import GHC.Platform
16 import GHC.Utils.Panic
17
18 -- trivColorable ---------------------------------------------------------------
19
20 -- trivColorable function for the graph coloring allocator
21 --
22 -- This gets hammered by scanGraph during register allocation,
23 -- so needs to be fairly efficient.
24 --
25 -- NOTE: This only works for architectures with just RcInteger and RcDouble
26 -- (which are disjoint) ie. x86, x86_64 and ppc
27 --
28 -- The number of allocatable regs is hard coded in here so we can do
29 -- a fast comparison in trivColorable.
30 --
31 -- It's ok if these numbers are _less_ than the actual number of free
32 -- regs, but they can't be more or the register conflict
33 -- graph won't color.
34 --
35 -- If the graph doesn't color then the allocator will panic, but it won't
36 -- generate bad object code or anything nasty like that.
37 --
38 -- There is an allocatableRegsInClass :: RegClass -> Int, but doing
39 -- the unboxing is too slow for us here.
40 -- TODO: Is that still true? Could we use allocatableRegsInClass
41 -- without losing performance now?
42 --
43 -- Look at rts/include/stg/MachRegs.h to get the numbers.
44 --
45
46
47 -- Disjoint registers ----------------------------------------------------------
48 --
49 -- The definition has been unfolded into individual cases for speed.
50 -- Each architecture has a different register setup, so we use a
51 -- different regSqueeze function for each.
52 --
53 accSqueeze
54 :: Int
55 -> Int
56 -> (reg -> Int)
57 -> UniqSet reg
58 -> Int
59
60 accSqueeze count maxCount squeeze us = acc count (nonDetEltsUniqSet us)
61 -- See Note [Unique Determinism and code generation]
62 where acc count [] = count
63 acc count _ | count >= maxCount = count
64 acc count (r:rs) = acc (count + squeeze r) rs
65
66 {- Note [accSqueeze]
67 ~~~~~~~~~~~~~~~~~~~~
68 BL 2007/09
69 Doing a nice fold over the UniqSet makes trivColorable use
70 32% of total compile time and 42% of total alloc when compiling SHA1.hs from darcs.
71 Therefore the UniqFM is made non-abstract and we use custom fold.
72
73 MS 2010/04
74 When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
75 representation any more. But it is imperative that the accSqueeze stops
76 the folding if the count gets greater or equal to maxCount. We thus convert
77 UniqFM to a (lazy) list, do the fold and stops if necessary, which was
78 the most efficient variant tried. Benchmark compiling 10-times SHA1.hs follows.
79 (original = previous implementation, folding = fold of the whole UFM,
80 lazyFold = the current implementation,
81 hackFold = using internal representation of Data.IntMap)
82
83 original folding hackFold lazyFold
84 -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s
85 100.00% 96.44% 97.72% 97.12%
86 -fregs-graph 67.938s 74.875s 62.673s 64.679s
87 100.00% 110.21% 92.25% 95.20%
88 -fregs-iterative 89.761s 143.913s 81.075s 86.912s
89 100.00% 160.33% 90.32% 96.83%
90 -fnew-codegen 38.225s 37.142s 37.551s 37.119s
91 100.00% 97.17% 98.24% 97.11%
92 -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s
93 100.00% 99.70% 95.19% 94.65%
94 -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s
95 100.00% 166.23% 94.18% 100.95%
96 -}
97
98 trivColorable
99 :: Platform
100 -> (RegClass -> VirtualReg -> Int)
101 -> (RegClass -> RealReg -> Int)
102 -> Triv VirtualReg RegClass RealReg
103
104 trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
105 | let cALLOCATABLE_REGS_INTEGER
106 = (case platformArch platform of
107 ArchX86 -> 3
108 ArchX86_64 -> 5
109 ArchPPC -> 16
110 ArchSPARC -> 14
111 ArchSPARC64 -> panic "trivColorable ArchSPARC64"
112 ArchPPC_64 _ -> 15
113 ArchARM _ _ _ -> panic "trivColorable ArchARM"
114 -- We should be able to allocate *a lot* more in princple.
115 -- essentially all 32 - SP, so 31, we'd trash the link reg
116 -- as well as the platform and all others though.
117 ArchAArch64 -> 18
118 ArchAlpha -> panic "trivColorable ArchAlpha"
119 ArchMipseb -> panic "trivColorable ArchMipseb"
120 ArchMipsel -> panic "trivColorable ArchMipsel"
121 ArchS390X -> panic "trivColorable ArchS390X"
122 ArchRISCV64 -> panic "trivColorable ArchRISCV64"
123 ArchJavaScript-> panic "trivColorable ArchJavaScript"
124 ArchUnknown -> panic "trivColorable ArchUnknown")
125 , count2 <- accSqueeze 0 cALLOCATABLE_REGS_INTEGER
126 (virtualRegSqueeze RcInteger)
127 conflicts
128
129 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_INTEGER
130 (realRegSqueeze RcInteger)
131 exclusions
132
133 = count3 < cALLOCATABLE_REGS_INTEGER
134
135 trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
136 | let cALLOCATABLE_REGS_FLOAT
137 = (case platformArch platform of
138 -- On x86_64 and x86, Float and RcDouble
139 -- use the same registers,
140 -- so we only use RcDouble to represent the
141 -- register allocation problem on those types.
142 ArchX86 -> 0
143 ArchX86_64 -> 0
144 ArchPPC -> 0
145 ArchSPARC -> 22
146 ArchSPARC64 -> panic "trivColorable ArchSPARC64"
147 ArchPPC_64 _ -> 0
148 ArchARM _ _ _ -> panic "trivColorable ArchARM"
149 -- we can in princple address all the float regs as
150 -- segments. So we could have 64 Float regs. Or
151 -- 128 Half regs, or even 256 Byte regs.
152 ArchAArch64 -> 0
153 ArchAlpha -> panic "trivColorable ArchAlpha"
154 ArchMipseb -> panic "trivColorable ArchMipseb"
155 ArchMipsel -> panic "trivColorable ArchMipsel"
156 ArchS390X -> panic "trivColorable ArchS390X"
157 ArchRISCV64 -> panic "trivColorable ArchRISCV64"
158 ArchJavaScript-> panic "trivColorable ArchJavaScript"
159 ArchUnknown -> panic "trivColorable ArchUnknown")
160 , count2 <- accSqueeze 0 cALLOCATABLE_REGS_FLOAT
161 (virtualRegSqueeze RcFloat)
162 conflicts
163
164 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_FLOAT
165 (realRegSqueeze RcFloat)
166 exclusions
167
168 = count3 < cALLOCATABLE_REGS_FLOAT
169
170 trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
171 | let cALLOCATABLE_REGS_DOUBLE
172 = (case platformArch platform of
173 ArchX86 -> 8
174 -- in x86 32bit mode sse2 there are only
175 -- 8 XMM registers xmm0 ... xmm7
176 ArchX86_64 -> 10
177 -- in x86_64 there are 16 XMM registers
178 -- xmm0 .. xmm15, here 10 is a
179 -- "dont need to solve conflicts" count that
180 -- was chosen at some point in the past.
181 ArchPPC -> 26
182 ArchSPARC -> 11
183 ArchSPARC64 -> panic "trivColorable ArchSPARC64"
184 ArchPPC_64 _ -> 20
185 ArchARM _ _ _ -> panic "trivColorable ArchARM"
186 ArchAArch64 -> 32
187 ArchAlpha -> panic "trivColorable ArchAlpha"
188 ArchMipseb -> panic "trivColorable ArchMipseb"
189 ArchMipsel -> panic "trivColorable ArchMipsel"
190 ArchS390X -> panic "trivColorable ArchS390X"
191 ArchRISCV64 -> panic "trivColorable ArchRISCV64"
192 ArchJavaScript-> panic "trivColorable ArchJavaScript"
193 ArchUnknown -> panic "trivColorable ArchUnknown")
194 , count2 <- accSqueeze 0 cALLOCATABLE_REGS_DOUBLE
195 (virtualRegSqueeze RcDouble)
196 conflicts
197
198 , count3 <- accSqueeze count2 cALLOCATABLE_REGS_DOUBLE
199 (realRegSqueeze RcDouble)
200 exclusions
201
202 = count3 < cALLOCATABLE_REGS_DOUBLE
203
204
205
206
207 -- Specification Code ----------------------------------------------------------
208 --
209 -- The trivColorable function for each particular architecture should
210 -- implement the following function, but faster.
211 --
212
213 {-
214 trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
215 trivColorable classN conflicts exclusions
216 = let
217
218 acc :: Reg -> (Int, Int) -> (Int, Int)
219 acc r (cd, cf)
220 = case regClass r of
221 RcInteger -> (cd+1, cf)
222 RcFloat -> (cd, cf+1)
223 _ -> panic "Regs.trivColorable: reg class not handled"
224
225 tmp = nonDetFoldUFM acc (0, 0) conflicts
226 (countInt, countFloat) = nonDetFoldUFM acc tmp exclusions
227
228 squeese = worst countInt classN RcInteger
229 + worst countFloat classN RcFloat
230
231 in squeese < allocatableRegsInClass classN
232
233 -- | Worst case displacement
234 -- node N of classN has n neighbors of class C.
235 --
236 -- We currently only have RcInteger and RcDouble, which don't conflict at all.
237 -- This is a bit boring compared to what's in RegArchX86.
238 --
239 worst :: Int -> RegClass -> RegClass -> Int
240 worst n classN classC
241 = case classN of
242 RcInteger
243 -> case classC of
244 RcInteger -> min n (allocatableRegsInClass RcInteger)
245 RcFloat -> 0
246
247 RcDouble
248 -> case classC of
249 RcFloat -> min n (allocatableRegsInClass RcFloat)
250 RcInteger -> 0
251
252 -- allocatableRegs is allMachRegNos with the fixed-use regs removed.
253 -- i.e., these are the regs for which we are prepared to allow the
254 -- register allocator to attempt to map VRegs to.
255 allocatableRegs :: [RegNo]
256 allocatableRegs
257 = let isFree i = freeReg i
258 in filter isFree allMachRegNos
259
260
261 -- | The number of regs in each class.
262 -- We go via top level CAFs to ensure that we're not recomputing
263 -- the length of these lists each time the fn is called.
264 allocatableRegsInClass :: RegClass -> Int
265 allocatableRegsInClass cls
266 = case cls of
267 RcInteger -> allocatableRegsInteger
268 RcFloat -> allocatableRegsDouble
269
270 allocatableRegsInteger :: Int
271 allocatableRegsInteger
272 = length $ filter (\r -> regClass r == RcInteger)
273 $ map RealReg allocatableRegs
274
275 allocatableRegsFloat :: Int
276 allocatableRegsFloat
277 = length $ filter (\r -> regClass r == RcFloat
278 $ map RealReg allocatableRegs
279 -}