never executed always true always false
    1 module GHC.CmmToAsm.Reg.Graph.TrivColorable (
    2         trivColorable,
    3 )
    4 
    5 where
    6 
    7 import GHC.Prelude
    8 
    9 import GHC.Platform.Reg.Class
   10 import GHC.Platform.Reg
   11 
   12 import GHC.Data.Graph.Base
   13 
   14 import GHC.Types.Unique.Set
   15 import GHC.Platform
   16 import GHC.Utils.Panic
   17 
   18 -- trivColorable ---------------------------------------------------------------
   19 
   20 -- trivColorable function for the graph coloring allocator
   21 --
   22 --      This gets hammered by scanGraph during register allocation,
   23 --      so needs to be fairly efficient.
   24 --
   25 --      NOTE:   This only works for architectures with just RcInteger and RcDouble
   26 --              (which are disjoint) ie. x86, x86_64 and ppc
   27 --
   28 --      The number of allocatable regs is hard coded in here so we can do
   29 --              a fast comparison in trivColorable.
   30 --
   31 --      It's ok if these numbers are _less_ than the actual number of free
   32 --              regs, but they can't be more or the register conflict
   33 --              graph won't color.
   34 --
   35 --      If the graph doesn't color then the allocator will panic, but it won't
   36 --              generate bad object code or anything nasty like that.
   37 --
   38 --      There is an allocatableRegsInClass :: RegClass -> Int, but doing
   39 --      the unboxing is too slow for us here.
   40 --      TODO: Is that still true? Could we use allocatableRegsInClass
   41 --      without losing performance now?
   42 --
   43 --      Look at rts/include/stg/MachRegs.h to get the numbers.
   44 --
   45 
   46 
   47 -- Disjoint registers ----------------------------------------------------------
   48 --
   49 --      The definition has been unfolded into individual cases for speed.
   50 --      Each architecture has a different register setup, so we use a
   51 --      different regSqueeze function for each.
   52 --
   53 accSqueeze
   54         :: Int
   55         -> Int
   56         -> (reg -> Int)
   57         -> UniqSet reg
   58         -> Int
   59 
   60 accSqueeze count maxCount squeeze us = acc count (nonDetEltsUniqSet us)
   61   -- See Note [Unique Determinism and code generation]
   62   where acc count [] = count
   63         acc count _ | count >= maxCount = count
   64         acc count (r:rs) = acc (count + squeeze r) rs
   65 
   66 {- Note [accSqueeze]
   67 ~~~~~~~~~~~~~~~~~~~~
   68 BL 2007/09
   69 Doing a nice fold over the UniqSet makes trivColorable use
   70 32% of total compile time and 42% of total alloc when compiling SHA1.hs from darcs.
   71 Therefore the UniqFM is made non-abstract and we use custom fold.
   72 
   73 MS 2010/04
   74 When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
   75 representation any more. But it is imperative that the accSqueeze stops
   76 the folding if the count gets greater or equal to maxCount. We thus convert
   77 UniqFM to a (lazy) list, do the fold and stops if necessary, which was
   78 the most efficient variant tried. Benchmark compiling 10-times SHA1.hs follows.
   79 (original = previous implementation, folding = fold of the whole UFM,
   80  lazyFold = the current implementation,
   81  hackFold = using internal representation of Data.IntMap)
   82 
   83                                  original  folding   hackFold  lazyFold
   84  -O -fasm (used everywhere)      31.509s   30.387s   30.791s   30.603s
   85                                  100.00%   96.44%    97.72%    97.12%
   86  -fregs-graph                    67.938s   74.875s   62.673s   64.679s
   87                                  100.00%   110.21%   92.25%    95.20%
   88  -fregs-iterative                89.761s   143.913s  81.075s   86.912s
   89                                  100.00%   160.33%   90.32%    96.83%
   90  -fnew-codegen                   38.225s   37.142s   37.551s   37.119s
   91                                  100.00%   97.17%    98.24%    97.11%
   92  -fnew-codegen -fregs-graph      91.786s   91.51s    87.368s   86.88s
   93                                  100.00%   99.70%    95.19%    94.65%
   94  -fnew-codegen -fregs-iterative  206.72s   343.632s  194.694s  208.677s
   95                                  100.00%   166.23%   94.18%    100.95%
   96 -}
   97 
   98 trivColorable
   99         :: Platform
  100         -> (RegClass -> VirtualReg -> Int)
  101         -> (RegClass -> RealReg    -> Int)
  102         -> Triv VirtualReg RegClass RealReg
  103 
  104 trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
  105         | let cALLOCATABLE_REGS_INTEGER
  106                   =        (case platformArch platform of
  107                             ArchX86       -> 3
  108                             ArchX86_64    -> 5
  109                             ArchPPC       -> 16
  110                             ArchSPARC     -> 14
  111                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
  112                             ArchPPC_64 _  -> 15
  113                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
  114                             -- We should be able to allocate *a lot* more in princple.
  115                             -- essentially all 32 - SP, so 31, we'd trash the link reg
  116                             -- as well as the platform and all others though.
  117                             ArchAArch64   -> 18
  118                             ArchAlpha     -> panic "trivColorable ArchAlpha"
  119                             ArchMipseb    -> panic "trivColorable ArchMipseb"
  120                             ArchMipsel    -> panic "trivColorable ArchMipsel"
  121                             ArchS390X     -> panic "trivColorable ArchS390X"
  122                             ArchRISCV64   -> panic "trivColorable ArchRISCV64"
  123                             ArchJavaScript-> panic "trivColorable ArchJavaScript"
  124                             ArchUnknown   -> panic "trivColorable ArchUnknown")
  125         , count2        <- accSqueeze 0 cALLOCATABLE_REGS_INTEGER
  126                                 (virtualRegSqueeze RcInteger)
  127                                 conflicts
  128 
  129         , count3        <- accSqueeze  count2    cALLOCATABLE_REGS_INTEGER
  130                                 (realRegSqueeze   RcInteger)
  131                                 exclusions
  132 
  133         = count3 < cALLOCATABLE_REGS_INTEGER
  134 
  135 trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
  136         | let cALLOCATABLE_REGS_FLOAT
  137                   =        (case platformArch platform of
  138                     -- On x86_64 and x86, Float and RcDouble
  139                     -- use the same registers,
  140                     -- so we only use RcDouble to represent the
  141                     -- register allocation problem on those types.
  142                             ArchX86       -> 0
  143                             ArchX86_64    -> 0
  144                             ArchPPC       -> 0
  145                             ArchSPARC     -> 22
  146                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
  147                             ArchPPC_64 _  -> 0
  148                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
  149                             -- we can in princple address all the float regs as
  150                             -- segments. So we could have 64 Float regs. Or
  151                             -- 128 Half regs, or even 256 Byte regs.
  152                             ArchAArch64   -> 0
  153                             ArchAlpha     -> panic "trivColorable ArchAlpha"
  154                             ArchMipseb    -> panic "trivColorable ArchMipseb"
  155                             ArchMipsel    -> panic "trivColorable ArchMipsel"
  156                             ArchS390X     -> panic "trivColorable ArchS390X"
  157                             ArchRISCV64   -> panic "trivColorable ArchRISCV64"
  158                             ArchJavaScript-> panic "trivColorable ArchJavaScript"
  159                             ArchUnknown   -> panic "trivColorable ArchUnknown")
  160         , count2        <- accSqueeze 0 cALLOCATABLE_REGS_FLOAT
  161                                 (virtualRegSqueeze RcFloat)
  162                                 conflicts
  163 
  164         , count3        <- accSqueeze  count2    cALLOCATABLE_REGS_FLOAT
  165                                 (realRegSqueeze   RcFloat)
  166                                 exclusions
  167 
  168         = count3 < cALLOCATABLE_REGS_FLOAT
  169 
  170 trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
  171         | let cALLOCATABLE_REGS_DOUBLE
  172                   =        (case platformArch platform of
  173                             ArchX86       -> 8
  174                             -- in x86 32bit mode sse2 there are only
  175                             -- 8 XMM registers xmm0 ... xmm7
  176                             ArchX86_64    -> 10
  177                             -- in x86_64 there are 16 XMM registers
  178                             -- xmm0 .. xmm15, here 10 is a
  179                             -- "dont need to solve conflicts" count that
  180                             -- was chosen at some point in the past.
  181                             ArchPPC       -> 26
  182                             ArchSPARC     -> 11
  183                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
  184                             ArchPPC_64 _  -> 20
  185                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
  186                             ArchAArch64   -> 32
  187                             ArchAlpha     -> panic "trivColorable ArchAlpha"
  188                             ArchMipseb    -> panic "trivColorable ArchMipseb"
  189                             ArchMipsel    -> panic "trivColorable ArchMipsel"
  190                             ArchS390X     -> panic "trivColorable ArchS390X"
  191                             ArchRISCV64   -> panic "trivColorable ArchRISCV64"
  192                             ArchJavaScript-> panic "trivColorable ArchJavaScript"
  193                             ArchUnknown   -> panic "trivColorable ArchUnknown")
  194         , count2        <- accSqueeze 0 cALLOCATABLE_REGS_DOUBLE
  195                                 (virtualRegSqueeze RcDouble)
  196                                 conflicts
  197 
  198         , count3        <- accSqueeze  count2    cALLOCATABLE_REGS_DOUBLE
  199                                 (realRegSqueeze   RcDouble)
  200                                 exclusions
  201 
  202         = count3 < cALLOCATABLE_REGS_DOUBLE
  203 
  204 
  205 
  206 
  207 -- Specification Code ----------------------------------------------------------
  208 --
  209 --      The trivColorable function for each particular architecture should
  210 --      implement the following function, but faster.
  211 --
  212 
  213 {-
  214 trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
  215 trivColorable classN conflicts exclusions
  216  = let
  217 
  218         acc :: Reg -> (Int, Int) -> (Int, Int)
  219         acc r (cd, cf)
  220          = case regClass r of
  221                 RcInteger       -> (cd+1, cf)
  222                 RcFloat         -> (cd,   cf+1)
  223                 _               -> panic "Regs.trivColorable: reg class not handled"
  224 
  225         tmp                     = nonDetFoldUFM acc (0, 0) conflicts
  226         (countInt,  countFloat) = nonDetFoldUFM acc tmp    exclusions
  227 
  228         squeese         = worst countInt   classN RcInteger
  229                         + worst countFloat classN RcFloat
  230 
  231    in   squeese < allocatableRegsInClass classN
  232 
  233 -- | Worst case displacement
  234 --      node N of classN has n neighbors of class C.
  235 --
  236 --      We currently only have RcInteger and RcDouble, which don't conflict at all.
  237 --      This is a bit boring compared to what's in RegArchX86.
  238 --
  239 worst :: Int -> RegClass -> RegClass -> Int
  240 worst n classN classC
  241  = case classN of
  242         RcInteger
  243          -> case classC of
  244                 RcInteger       -> min n (allocatableRegsInClass RcInteger)
  245                 RcFloat         -> 0
  246 
  247         RcDouble
  248          -> case classC of
  249                 RcFloat         -> min n (allocatableRegsInClass RcFloat)
  250                 RcInteger       -> 0
  251 
  252 -- allocatableRegs is allMachRegNos with the fixed-use regs removed.
  253 -- i.e., these are the regs for which we are prepared to allow the
  254 -- register allocator to attempt to map VRegs to.
  255 allocatableRegs :: [RegNo]
  256 allocatableRegs
  257    = let isFree i = freeReg i
  258      in  filter isFree allMachRegNos
  259 
  260 
  261 -- | The number of regs in each class.
  262 --      We go via top level CAFs to ensure that we're not recomputing
  263 --      the length of these lists each time the fn is called.
  264 allocatableRegsInClass :: RegClass -> Int
  265 allocatableRegsInClass cls
  266  = case cls of
  267         RcInteger       -> allocatableRegsInteger
  268         RcFloat         -> allocatableRegsDouble
  269 
  270 allocatableRegsInteger :: Int
  271 allocatableRegsInteger
  272         = length $ filter (\r -> regClass r == RcInteger)
  273                  $ map RealReg allocatableRegs
  274 
  275 allocatableRegsFloat :: Int
  276 allocatableRegsFloat
  277         = length $ filter (\r -> regClass r == RcFloat
  278                  $ map RealReg allocatableRegs
  279 -}