never executed always true always false

    1 {-# LANGUAGE BangPatterns #-}
    2 {-# LANGUAGE GADTs #-}
    3 {-# LANGUAGE NondecreasingIndentation #-}
    4 {-# LANGUAGE TupleSections #-}
    5 
    6 {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
    7 
    8 -----------------------------------------------------------------------------
    9 --
   10 -- Generating machine code (instruction selection)
   11 --
   12 -- (c) The University of Glasgow 1996-2004
   13 --
   14 -----------------------------------------------------------------------------
   15 
   16 -- This is a big module, but, if you pay attention to
   17 -- (a) the sectioning, and (b) the type signatures, the
   18 -- structure should not be too overwhelming.
   19 
   20 module GHC.CmmToAsm.X86.CodeGen (
   21         cmmTopCodeGen,
   22         generateJumpTableForInstr,
   23         extractUnwindPoints,
   24         invertCondBranches,
   25         InstrBlock
   26 )
   27 
   28 where
   29 
   30 -- NCG stuff:
   31 import GHC.Prelude
   32 
   33 import GHC.CmmToAsm.X86.Instr
   34 import GHC.CmmToAsm.X86.Cond
   35 import GHC.CmmToAsm.X86.Regs
   36 import GHC.CmmToAsm.X86.Ppr
   37 import GHC.CmmToAsm.X86.RegInfo
   38 
   39 import GHC.Platform.Regs
   40 import GHC.CmmToAsm.CPrim
   41 import GHC.CmmToAsm.Types
   42 import GHC.Cmm.DebugBlock
   43    ( DebugBlock(..), UnwindPoint(..), UnwindTable
   44    , UnwindExpr(UwReg), toUnwindExpr
   45    )
   46 import GHC.CmmToAsm.PIC
   47 import GHC.CmmToAsm.Monad
   48    ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
   49    , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat
   50    , getPicBaseMaybeNat, getDebugBlock, getFileId
   51    , addImmediateSuccessorNat, updateCfgNat, getConfig, getPlatform
   52    , getCfgWeights
   53    )
   54 import GHC.CmmToAsm.CFG
   55 import GHC.CmmToAsm.Format
   56 import GHC.CmmToAsm.Config
   57 import GHC.Platform.Reg
   58 import GHC.Platform
   59 
   60 -- Our intermediate code:
   61 import GHC.Types.Basic
   62 import GHC.Cmm.BlockId
   63 import GHC.Unit.Types ( primUnitId )
   64 import GHC.Cmm.Utils
   65 import GHC.Cmm.Switch
   66 import GHC.Cmm
   67 import GHC.Cmm.Dataflow.Block
   68 import GHC.Cmm.Dataflow.Collections
   69 import GHC.Cmm.Dataflow.Graph
   70 import GHC.Cmm.Dataflow.Label
   71 import GHC.Cmm.CLabel
   72 import GHC.Types.Tickish ( GenTickish(..) )
   73 import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
   74 
   75 -- The rest:
   76 import GHC.Types.ForeignCall ( CCallConv(..) )
   77 import GHC.Data.OrdList
   78 import GHC.Utils.Outputable
   79 import GHC.Utils.Constants (debugIsOn)
   80 import GHC.Utils.Panic
   81 import GHC.Utils.Panic.Plain
   82 import GHC.Data.FastString
   83 import GHC.Driver.Session
   84 import GHC.Utils.Misc
   85 import GHC.Types.Unique.Supply ( getUniqueM )
   86 
   87 import Control.Monad
   88 import Data.Foldable (fold)
   89 import Data.Int
   90 import Data.Maybe
   91 import Data.Word
   92 
   93 import qualified Data.Map as M
   94 
   95 is32BitPlatform :: NatM Bool
   96 is32BitPlatform = do
   97     platform <- getPlatform
   98     return $ target32Bit platform
   99 
  100 sse2Enabled :: NatM Bool
  101 sse2Enabled = do
  102   config <- getConfig
  103   return (ncgSseVersion config >= Just SSE2)
  104 
  105 sse4_2Enabled :: NatM Bool
  106 sse4_2Enabled = do
  107   config <- getConfig
  108   return (ncgSseVersion config >= Just SSE42)
  109 
  110 cmmTopCodeGen
  111         :: RawCmmDecl
  112         -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]
  113 
  114 cmmTopCodeGen (CmmProc info lab live graph) = do
  115   let blocks = toBlockListEntryFirst graph
  116   (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
  117   picBaseMb <- getPicBaseMaybeNat
  118   platform <- getPlatform
  119   let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
  120       tops = proc : concat statics
  121       os   = platformOS platform
  122 
  123   case picBaseMb of
  124       Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
  125       Nothing -> return tops
  126 
  127 cmmTopCodeGen (CmmData sec dat) =
  128   return [CmmData sec (mkAlignment 1, dat)]  -- no translation, we just use CmmStatic
  129 
  130 {- Note [Verifying basic blocks]
  131    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  132 
  133    We want to guarantee a few things about the results
  134    of instruction selection.
  135 
  136    Namely that each basic blocks consists of:
  137     * A (potentially empty) sequence of straight line instructions
  138   followed by
  139     * A (potentially empty) sequence of jump like instructions.
  140 
  141     We can verify this by going through the instructions and
  142     making sure that any non-jumpish instruction can't appear
  143     after a jumpish instruction.
  144 
  145     There are gotchas however:
  146     * CALLs are strictly speaking control flow but here we care
  147       not about them. Hence we treat them as regular instructions.
  148 
  149       It's safe for them to appear inside a basic block
  150       as (ignoring side effects inside the call) they will result in
  151       straight line code.
  152 
  153     * NEWBLOCK marks the start of a new basic block so can
  154       be followed by any instructions.
  155 -}
  156 
  157 -- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally.
  158 verifyBasicBlock :: Platform -> [Instr] -> ()
  159 verifyBasicBlock platform instrs
  160   | debugIsOn     = go False instrs
  161   | otherwise     = ()
  162   where
  163     go _     [] = ()
  164     go atEnd (i:instr)
  165         = case i of
  166             -- Start a new basic block
  167             NEWBLOCK {} -> go False instr
  168             -- Calls are not viable block terminators
  169             CALL {}     | atEnd -> faultyBlockWith i
  170                         | not atEnd -> go atEnd instr
  171             -- All instructions ok, check if we reached the end and continue.
  172             _ | not atEnd -> go (isJumpishInstr i) instr
  173               -- Only jumps allowed at the end of basic blocks.
  174               | otherwise -> if isJumpishInstr i
  175                                 then go True instr
  176                                 else faultyBlockWith i
  177     faultyBlockWith i
  178         = pprPanic "Non control flow instructions after end of basic block."
  179                    (pprInstr platform i <+> text "in:" $$ vcat (map (pprInstr platform) instrs))
  180 
  181 basicBlockCodeGen
  182         :: CmmBlock
  183         -> NatM ( [NatBasicBlock Instr]
  184                 , [NatCmmDecl (Alignment, RawCmmStatics) Instr])
  185 
  186 basicBlockCodeGen block = do
  187   let (_, nodes, tail)  = blockSplit block
  188       id = entryLabel block
  189       stmts = blockToList nodes
  190   -- Generate location directive
  191   dbg <- getDebugBlock (entryLabel block)
  192   loc_instrs <- case dblSourceTick =<< dbg of
  193     Just (SourceNote span name)
  194       -> do fileId <- getFileId (srcSpanFile span)
  195             let line = srcSpanStartLine span; col = srcSpanStartCol span
  196             return $ unitOL $ LOCATION fileId line col name
  197     _ -> return nilOL
  198   (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
  199   (!tail_instrs,_) <- stmtToInstrs mid_bid tail
  200   let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
  201   platform <- getPlatform
  202   return $! verifyBasicBlock platform (fromOL instrs)
  203   instrs' <- fold <$> traverse addSpUnwindings instrs
  204   -- code generation may introduce new basic block boundaries, which
  205   -- are indicated by the NEWBLOCK instruction.  We must split up the
  206   -- instruction stream into basic blocks again.  Also, we extract
  207   -- LDATAs here too.
  208   let
  209         (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'
  210 
  211         mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
  212           = ([], BasicBlock id instrs : blocks, statics)
  213         mkBlocks (LDATA sec dat) (instrs,blocks,statics)
  214           = (instrs, blocks, CmmData sec dat:statics)
  215         mkBlocks instr (instrs,blocks,statics)
  216           = (instr:instrs, blocks, statics)
  217   return (BasicBlock id top : other_blocks, statics)
  218 
  219 -- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
  220 -- in the @sp@ register. See Note [What is this unwinding business?] in "GHC.Cmm.DebugBlock"
  221 -- for details.
  222 addSpUnwindings :: Instr -> NatM (OrdList Instr)
  223 addSpUnwindings instr@(DELTA d) = do
  224     config <- getConfig
  225     if ncgDwarfUnwindings config
  226         then do lbl <- mkAsmTempLabel <$> getUniqueM
  227                 let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
  228                 return $ toOL [ instr, UNWIND lbl unwind ]
  229         else return (unitOL instr)
  230 addSpUnwindings instr = return $ unitOL instr
  231 
  232 {- Note [Keeping track of the current block]
  233    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  234 
  235 When generating instructions for Cmm we sometimes require
  236 the current block for things like retry loops.
  237 
  238 We also sometimes change the current block, if a MachOP
  239 results in branching control flow.
  240 
  241 Issues arise if we have two statements in the same block,
  242 which both depend on the current block id *and* change the
  243 basic block after them. This happens for atomic primops
  244 in the X86 backend where we want to update the CFG data structure
  245 when introducing new basic blocks.
  246 
  247 For example in #17334 we got this Cmm code:
  248 
  249         c3Bf: // global
  250             (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18);
  251             (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0);
  252             _s3sT::I64 = _s3sV::I64;
  253             goto c3B1;
  254 
  255 This resulted in two new basic blocks being inserted:
  256 
  257         c3Bf:
  258                 movl $18,%vI_n3Bo
  259                 movq 88(%vI_s3sQ),%rax
  260                 jmp _n3Bp
  261         n3Bp:
  262                 ...
  263                 cmpxchgq %vI_n3Bq,88(%vI_s3sQ)
  264                 jne _n3Bp
  265                 ...
  266                 jmp _n3Bs
  267         n3Bs:
  268                 ...
  269                 cmpxchgq %vI_n3Bt,88(%vI_s3sQ)
  270                 jne _n3Bs
  271                 ...
  272                 jmp _c3B1
  273         ...
  274 
  275 Based on the Cmm we called stmtToInstrs we translated both atomic operations under
  276 the assumption they would be placed into their Cmm basic block `c3Bf`.
  277 However for the retry loop we introduce new labels, so this is not the case
  278 for the second statement.
  279 This resulted in a desync between the explicit control flow graph
  280 we construct as a separate data type and the actual control flow graph in the code.
  281 
  282 Instead we now return the new basic block if a statement causes a change
  283 in the current block and use the block for all following statements.
  284 
  285 For this reason genCCall is also split into two parts.  One for calls which
  286 *won't* change the basic blocks in which successive instructions will be
  287 placed (since they only evaluate CmmExpr, which can only contain MachOps, which
  288 cannot introduce basic blocks in their lowerings).  A different one for calls
  289 which *are* known to change the basic block.
  290 
  291 -}
  292 
  293 -- See Note [Keeping track of the current block] for why
  294 -- we pass the BlockId.
  295 stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in.
  296               -> [CmmNode O O] -- ^ Cmm Statement
  297               -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction
  298 stmtsToInstrs bid stmts =
  299     go bid stmts nilOL
  300   where
  301     go bid  []        instrs = return (instrs,bid)
  302     go bid (s:stmts)  instrs = do
  303       (instrs',bid') <- stmtToInstrs bid s
  304       -- If the statement introduced a new block, we use that one
  305       let !newBid = fromMaybe bid bid'
  306       go newBid stmts (instrs `appOL` instrs')
  307 
  308 -- | `bid` refers to the current block and is used to update the CFG
  309 --   if new blocks are inserted in the control flow.
  310 -- See Note [Keeping track of the current block] for more details.
  311 stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
  312              -> CmmNode e x
  313              -> NatM (InstrBlock, Maybe BlockId)
  314              -- ^ Instructions, and bid of new block if successive
  315              -- statements are placed in a different basic block.
  316 stmtToInstrs bid stmt = do
  317   is32Bit <- is32BitPlatform
  318   platform <- getPlatform
  319   case stmt of
  320     CmmUnsafeForeignCall target result_regs args
  321        -> genCCall is32Bit target result_regs args bid
  322 
  323     _ -> (,Nothing) <$> case stmt of
  324       CmmComment s   -> return (unitOL (COMMENT $ ftext s))
  325       CmmTick {}     -> return nilOL
  326 
  327       CmmUnwind regs -> do
  328         let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
  329             to_unwind_entry (reg, expr) = M.singleton reg (fmap (toUnwindExpr platform) expr)
  330         case foldMap to_unwind_entry regs of
  331           tbl | M.null tbl -> return nilOL
  332               | otherwise  -> do
  333                   lbl <- mkAsmTempLabel <$> getUniqueM
  334                   return $ unitOL $ UNWIND lbl tbl
  335 
  336       CmmAssign reg src
  337         | isFloatType ty         -> assignReg_FltCode format reg src
  338         | is32Bit && isWord64 ty -> assignReg_I64Code      reg src
  339         | otherwise              -> assignReg_IntCode format reg src
  340           where ty = cmmRegType platform reg
  341                 format = cmmTypeFormat ty
  342 
  343       CmmStore addr src
  344         | isFloatType ty         -> assignMem_FltCode format addr src
  345         | is32Bit && isWord64 ty -> assignMem_I64Code      addr src
  346         | otherwise              -> assignMem_IntCode format addr src
  347           where ty = cmmExprType platform src
  348                 format = cmmTypeFormat ty
  349 
  350       CmmBranch id          -> return $ genBranch id
  351 
  352       --We try to arrange blocks such that the likely branch is the fallthrough
  353       --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
  354       CmmCondBranch arg true false _ -> genCondBranch bid true false arg
  355       CmmSwitch arg ids -> genSwitch arg ids
  356       CmmCall { cml_target = arg
  357               , cml_args_regs = gregs } -> genJump arg (jumpRegs platform gregs)
  358       _ ->
  359         panic "stmtToInstrs: statement should have been cps'd away"
  360 
  361 
  362 jumpRegs :: Platform -> [GlobalReg] -> [Reg]
  363 jumpRegs platform gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
  364 
  365 --------------------------------------------------------------------------------
  366 -- | 'InstrBlock's are the insn sequences generated by the insn selectors.
  367 --      They are really trees of insns to facilitate fast appending, where a
  368 --      left-to-right traversal yields the insns in the correct order.
  369 --
  370 type InstrBlock
  371         = OrdList Instr
  372 
  373 
  374 -- | Condition codes passed up the tree.
  375 --
  376 data CondCode
  377         = CondCode Bool Cond InstrBlock
  378 
  379 
  380 -- | a.k.a "Register64"
  381 --      Reg is the lower 32-bit temporary which contains the result.
  382 --      Use getHiVRegFromLo to find the other VRegUnique.
  383 --
  384 --      Rules of this simplified insn selection game are therefore that
  385 --      the returned Reg may be modified
  386 --
  387 data ChildCode64
  388    = ChildCode64
  389         InstrBlock
  390         Reg
  391 
  392 
  393 -- | Register's passed up the tree.  If the stix code forces the register
  394 --      to live in a pre-decided machine register, it comes out as @Fixed@;
  395 --      otherwise, it comes out as @Any@, and the parent can decide which
  396 --      register to put it in.
  397 --
  398 data Register
  399         = Fixed Format Reg InstrBlock
  400         | Any   Format (Reg -> InstrBlock)
  401 
  402 
  403 swizzleRegisterRep :: Register -> Format -> Register
  404 swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
  405 swizzleRegisterRep (Any _ codefn)     format = Any   format codefn
  406 
  407 
  408 -- | Grab the Reg for a CmmReg
  409 getRegisterReg :: Platform  -> CmmReg -> Reg
  410 
  411 getRegisterReg _   (CmmLocal (LocalReg u pk))
  412   = -- by Assuming SSE2, Int,Word,Float,Double all can be register allocated
  413    let fmt = cmmTypeFormat pk in
  414         RegVirtual (mkVirtualReg u fmt)
  415 
  416 getRegisterReg platform  (CmmGlobal mid)
  417   = case globalRegMaybe platform mid of
  418         Just reg -> RegReal $ reg
  419         Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
  420         -- By this stage, the only MagicIds remaining should be the
  421         -- ones which map to a real machine register on this
  422         -- platform.  Hence ...
  423 
  424 
  425 -- | Memory addressing modes passed up the tree.
  426 data Amode
  427         = Amode AddrMode InstrBlock
  428 
  429 {-
  430 Now, given a tree (the argument to a CmmLoad) that references memory,
  431 produce a suitable addressing mode.
  432 
  433 A Rule of the Game (tm) for Amodes: use of the addr bit must
  434 immediately follow use of the code part, since the code part puts
  435 values in registers which the addr then refers to.  So you can't put
  436 anything in between, lest it overwrite some of those registers.  If
  437 you need to do some other computation between the code part and use of
  438 the addr bit, first store the effective address from the amode in a
  439 temporary, then do the other computation, and then use the temporary:
  440 
  441     code
  442     LEA amode, tmp
  443     ... other computation ...
  444     ... (tmp) ...
  445 -}
  446 
  447 
  448 -- | Check whether an integer will fit in 32 bits.
  449 --      A CmmInt is intended to be truncated to the appropriate
  450 --      number of bits, so here we truncate it to Int64.  This is
  451 --      important because e.g. -1 as a CmmInt might be either
  452 --      -1 or 18446744073709551615.
  453 --
  454 is32BitInteger :: Integer -> Bool
  455 is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
  456   where i64 = fromIntegral i :: Int64
  457 
  458 
  459 -- | Convert a BlockId to some CmmStatic data
  460 jumpTableEntry :: NCGConfig -> Maybe BlockId -> CmmStatic
  461 jumpTableEntry config Nothing = CmmStaticLit (CmmInt 0 (ncgWordWidth config))
  462 jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
  463     where blockLabel = blockLbl blockid
  464 
  465 
  466 -- -----------------------------------------------------------------------------
  467 -- General things for putting together code sequences
  468 
  469 -- Expand CmmRegOff.  ToDo: should we do it this way around, or convert
  470 -- CmmExprs into CmmRegOff?
  471 mangleIndexTree :: Platform -> CmmReg -> Int -> CmmExpr
  472 mangleIndexTree platform reg off
  473   = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
  474   where width = typeWidth (cmmRegType platform reg)
  475 
  476 -- | The dual to getAnyReg: compute an expression into a register, but
  477 --      we don't mind which one it is.
  478 getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
  479 getSomeReg expr = do
  480   r <- getRegister expr
  481   case r of
  482     Any rep code -> do
  483         tmp <- getNewRegNat rep
  484         return (tmp, code tmp)
  485     Fixed _ reg code ->
  486         return (reg, code)
  487 
  488 
  489 assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
  490 assignMem_I64Code addrTree valueTree = do
  491   Amode addr addr_code <- getAmode addrTree
  492   ChildCode64 vcode rlo <- iselExpr64 valueTree
  493   let
  494         rhi = getHiVRegFromLo rlo
  495 
  496         -- Little-endian store
  497         mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
  498         mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
  499   return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
  500 
  501 
  502 assignReg_I64Code :: CmmReg  -> CmmExpr -> NatM InstrBlock
  503 assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
  504    ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
  505    let
  506          r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
  507          r_dst_hi = getHiVRegFromLo r_dst_lo
  508          r_src_hi = getHiVRegFromLo r_src_lo
  509          mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
  510          mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
  511    return (
  512         vcode `snocOL` mov_lo `snocOL` mov_hi
  513      )
  514 
  515 assignReg_I64Code _ _
  516    = panic "assignReg_I64Code(i386): invalid lvalue"
  517 
  518 
  519 iselExpr64        :: CmmExpr -> NatM ChildCode64
  520 iselExpr64 (CmmLit (CmmInt i _)) = do
  521   (rlo,rhi) <- getNewRegPairNat II32
  522   let
  523         r = fromIntegral (fromIntegral i :: Word32)
  524         q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
  525         code = toOL [
  526                 MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
  527                 MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
  528                 ]
  529   return (ChildCode64 code rlo)
  530 
  531 iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
  532    Amode addr addr_code <- getAmode addrTree
  533    (rlo,rhi) <- getNewRegPairNat II32
  534    let
  535         mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
  536         mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
  537    return (
  538             ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
  539                         rlo
  540      )
  541 
  542 iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
  543    = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
  544 
  545 -- we handle addition, but rather badly
  546 iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
  547    ChildCode64 code1 r1lo <- iselExpr64 e1
  548    (rlo,rhi) <- getNewRegPairNat II32
  549    let
  550         r = fromIntegral (fromIntegral i :: Word32)
  551         q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
  552         r1hi = getHiVRegFromLo r1lo
  553         code =  code1 `appOL`
  554                 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
  555                        ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
  556                        MOV II32 (OpReg r1hi) (OpReg rhi),
  557                        ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
  558    return (ChildCode64 code rlo)
  559 
  560 iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
  561    ChildCode64 code1 r1lo <- iselExpr64 e1
  562    ChildCode64 code2 r2lo <- iselExpr64 e2
  563    (rlo,rhi) <- getNewRegPairNat II32
  564    let
  565         r1hi = getHiVRegFromLo r1lo
  566         r2hi = getHiVRegFromLo r2lo
  567         code =  code1 `appOL`
  568                 code2 `appOL`
  569                 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
  570                        ADD II32 (OpReg r2lo) (OpReg rlo),
  571                        MOV II32 (OpReg r1hi) (OpReg rhi),
  572                        ADC II32 (OpReg r2hi) (OpReg rhi) ]
  573    return (ChildCode64 code rlo)
  574 
  575 iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
  576    ChildCode64 code1 r1lo <- iselExpr64 e1
  577    ChildCode64 code2 r2lo <- iselExpr64 e2
  578    (rlo,rhi) <- getNewRegPairNat II32
  579    let
  580         r1hi = getHiVRegFromLo r1lo
  581         r2hi = getHiVRegFromLo r2lo
  582         code =  code1 `appOL`
  583                 code2 `appOL`
  584                 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
  585                        SUB II32 (OpReg r2lo) (OpReg rlo),
  586                        MOV II32 (OpReg r1hi) (OpReg rhi),
  587                        SBB II32 (OpReg r2hi) (OpReg rhi) ]
  588    return (ChildCode64 code rlo)
  589 
  590 iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
  591      fn <- getAnyReg expr
  592      r_dst_lo <-  getNewRegNat II32
  593      let r_dst_hi = getHiVRegFromLo r_dst_lo
  594          code = fn r_dst_lo
  595      return (
  596              ChildCode64 (code `snocOL`
  597                           MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
  598                           r_dst_lo
  599             )
  600 
  601 iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
  602      fn <- getAnyReg expr
  603      r_dst_lo <-  getNewRegNat II32
  604      let r_dst_hi = getHiVRegFromLo r_dst_lo
  605          code = fn r_dst_lo
  606      return (
  607              ChildCode64 (code `snocOL`
  608                           MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
  609                           CLTD II32 `snocOL`
  610                           MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
  611                           MOV II32 (OpReg edx) (OpReg r_dst_hi))
  612                           r_dst_lo
  613             )
  614 
  615 iselExpr64 expr
  616    = do
  617       platform <- getPlatform
  618       pprPanic "iselExpr64(i386)" (pdoc platform expr)
  619 
  620 
  621 --------------------------------------------------------------------------------
  622 getRegister :: CmmExpr -> NatM Register
  623 getRegister e = do platform <- getPlatform
  624                    is32Bit <- is32BitPlatform
  625                    getRegister' platform is32Bit e
  626 
  627 getRegister' :: Platform -> Bool -> CmmExpr -> NatM Register
  628 
  629 getRegister' platform is32Bit (CmmReg reg)
  630   = case reg of
  631         CmmGlobal PicBaseReg
  632          | is32Bit ->
  633             -- on x86_64, we have %rip for PicBaseReg, but it's not
  634             -- a full-featured register, it can only be used for
  635             -- rip-relative addressing.
  636             do reg' <- getPicBaseNat (archWordFormat is32Bit)
  637                return (Fixed (archWordFormat is32Bit) reg' nilOL)
  638         _ ->
  639             do
  640                let
  641                  fmt = cmmTypeFormat (cmmRegType platform reg)
  642                  format  = fmt
  643                --
  644                platform <- ncgPlatform <$> getConfig
  645                return (Fixed format
  646                              (getRegisterReg platform reg)
  647                              nilOL)
  648 
  649 
  650 getRegister' platform is32Bit (CmmRegOff r n)
  651   = getRegister' platform is32Bit $ mangleIndexTree platform r n
  652 
  653 getRegister' platform is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
  654   = addAlignmentCheck align <$> getRegister' platform is32Bit e
  655 
  656 -- for 32-bit architectures, support some 64 -> 32 bit conversions:
  657 -- TO_W_(x), TO_W_(x >> 32)
  658 
  659 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
  660                      [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
  661  | is32Bit = do
  662   ChildCode64 code rlo <- iselExpr64 x
  663   return $ Fixed II32 (getHiVRegFromLo rlo) code
  664 
  665 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
  666                      [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
  667  | is32Bit = do
  668   ChildCode64 code rlo <- iselExpr64 x
  669   return $ Fixed II32 (getHiVRegFromLo rlo) code
  670 
  671 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
  672  | is32Bit = do
  673   ChildCode64 code rlo <- iselExpr64 x
  674   return $ Fixed II32 rlo code
  675 
  676 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
  677  | is32Bit = do
  678   ChildCode64 code rlo <- iselExpr64 x
  679   return $ Fixed II32 rlo code
  680 
  681 getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
  682   float_const_sse2  where
  683   float_const_sse2
  684     | f == 0.0 = do
  685       let
  686           format = floatFormat w
  687           code dst = unitOL  (XOR format (OpReg dst) (OpReg dst))
  688         -- I don't know why there are xorpd, xorps, and pxor instructions.
  689         -- They all appear to do the same thing --SDM
  690       return (Any format code)
  691 
  692    | otherwise = do
  693       Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
  694       loadFloatAmode w addr code
  695 
  696 -- catch simple cases of zero- or sign-extended load
  697 getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
  698   code <- intLoadCode (MOVZxL II8) addr
  699   return (Any II32 code)
  700 
  701 getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
  702   code <- intLoadCode (MOVSxL II8) addr
  703   return (Any II32 code)
  704 
  705 getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
  706   code <- intLoadCode (MOVZxL II16) addr
  707   return (Any II32 code)
  708 
  709 getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
  710   code <- intLoadCode (MOVSxL II16) addr
  711   return (Any II32 code)
  712 
  713 -- catch simple cases of zero- or sign-extended load
  714 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
  715  | not is32Bit = do
  716   code <- intLoadCode (MOVZxL II8) addr
  717   return (Any II64 code)
  718 
  719 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
  720  | not is32Bit = do
  721   code <- intLoadCode (MOVSxL II8) addr
  722   return (Any II64 code)
  723 
  724 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
  725  | not is32Bit = do
  726   code <- intLoadCode (MOVZxL II16) addr
  727   return (Any II64 code)
  728 
  729 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
  730  | not is32Bit = do
  731   code <- intLoadCode (MOVSxL II16) addr
  732   return (Any II64 code)
  733 
  734 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
  735  | not is32Bit = do
  736   code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
  737   return (Any II64 code)
  738 
  739 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
  740  | not is32Bit = do
  741   code <- intLoadCode (MOVSxL II32) addr
  742   return (Any II64 code)
  743 
  744 getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
  745                                      CmmLit displacement])
  746  | not is32Bit =
  747       return $ Any II64 (\dst -> unitOL $
  748         LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
  749 
  750 getRegister' platform is32Bit (CmmMachOp mop [x]) = -- unary MachOps
  751     case mop of
  752       MO_F_Neg w  -> sse2NegCode w x
  753 
  754 
  755       MO_S_Neg w -> triv_ucode NEGI (intFormat w)
  756       MO_Not w   -> triv_ucode NOT  (intFormat w)
  757 
  758       -- Nop conversions
  759       MO_UU_Conv W32 W8  -> toI8Reg  W32 x
  760       MO_SS_Conv W32 W8  -> toI8Reg  W32 x
  761       MO_XX_Conv W32 W8  -> toI8Reg  W32 x
  762       MO_UU_Conv W16 W8  -> toI8Reg  W16 x
  763       MO_SS_Conv W16 W8  -> toI8Reg  W16 x
  764       MO_XX_Conv W16 W8  -> toI8Reg  W16 x
  765       MO_UU_Conv W32 W16 -> toI16Reg W32 x
  766       MO_SS_Conv W32 W16 -> toI16Reg W32 x
  767       MO_XX_Conv W32 W16 -> toI16Reg W32 x
  768 
  769       MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
  770       MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
  771       MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
  772       MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
  773       MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
  774       MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
  775       MO_UU_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
  776       MO_SS_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
  777       MO_XX_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
  778 
  779       MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
  780       MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
  781       MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
  782 
  783       -- widenings
  784       MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
  785       MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
  786       MO_UU_Conv W8  W16 -> integerExtend W8  W16 MOVZxL x
  787 
  788       MO_SS_Conv W8  W32 -> integerExtend W8  W32 MOVSxL x
  789       MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
  790       MO_SS_Conv W8  W16 -> integerExtend W8  W16 MOVSxL x
  791 
  792       -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit we
  793       -- have 8-bit registers only for a few registers (as opposed to x86-64 where every register
  794       -- has 8-bit version). So for 32-bit code, we'll just zero-extend.
  795       MO_XX_Conv W8  W32
  796           | is32Bit   -> integerExtend W8 W32 MOVZxL x
  797           | otherwise -> integerExtend W8 W32 MOV x
  798       MO_XX_Conv W8  W16
  799           | is32Bit   -> integerExtend W8 W16 MOVZxL x
  800           | otherwise -> integerExtend W8 W16 MOV x
  801       MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
  802 
  803       MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
  804       MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
  805       MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
  806       MO_SS_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVSxL x
  807       MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
  808       MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
  809       -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
  810       -- However, we don't want the register allocator to throw it
  811       -- away as an unnecessary reg-to-reg move, so we keep it in
  812       -- the form of a movzl and print it as a movl later.
  813       -- This doesn't apply to MO_XX_Conv since in this case we don't care about
  814       -- the upper bits. So we can just use MOV.
  815       MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOV x
  816       MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
  817       MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
  818 
  819       MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
  820 
  821 
  822       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
  823 
  824       MO_FS_Conv from to -> coerceFP2Int from to x
  825       MO_SF_Conv from to -> coerceInt2FP from to x
  826 
  827       MO_V_Insert {}   -> needLlvm
  828       MO_V_Extract {}  -> needLlvm
  829       MO_V_Add {}      -> needLlvm
  830       MO_V_Sub {}      -> needLlvm
  831       MO_V_Mul {}      -> needLlvm
  832       MO_VS_Quot {}    -> needLlvm
  833       MO_VS_Rem {}     -> needLlvm
  834       MO_VS_Neg {}     -> needLlvm
  835       MO_VU_Quot {}    -> needLlvm
  836       MO_VU_Rem {}     -> needLlvm
  837       MO_VF_Insert {}  -> needLlvm
  838       MO_VF_Extract {} -> needLlvm
  839       MO_VF_Add {}     -> needLlvm
  840       MO_VF_Sub {}     -> needLlvm
  841       MO_VF_Mul {}     -> needLlvm
  842       MO_VF_Quot {}    -> needLlvm
  843       MO_VF_Neg {}     -> needLlvm
  844 
  845       _other -> pprPanic "getRegister" (pprMachOp mop)
  846    where
  847         triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
  848         triv_ucode instr format = trivialUCode format (instr format) x
  849 
  850         -- signed or unsigned extension.
  851         integerExtend :: Width -> Width
  852                       -> (Format -> Operand -> Operand -> Instr)
  853                       -> CmmExpr -> NatM Register
  854         integerExtend from to instr expr = do
  855             (reg,e_code) <- if from == W8 then getByteReg expr
  856                                           else getSomeReg expr
  857             let
  858                 code dst =
  859                   e_code `snocOL`
  860                   instr (intFormat from) (OpReg reg) (OpReg dst)
  861             return (Any (intFormat to) code)
  862 
  863         toI8Reg :: Width -> CmmExpr -> NatM Register
  864         toI8Reg new_rep expr
  865             = do codefn <- getAnyReg expr
  866                  return (Any (intFormat new_rep) codefn)
  867                 -- HACK: use getAnyReg to get a byte-addressable register.
  868                 -- If the source was a Fixed register, this will add the
  869                 -- mov instruction to put it into the desired destination.
  870                 -- We're assuming that the destination won't be a fixed
  871                 -- non-byte-addressable register; it won't be, because all
  872                 -- fixed registers are word-sized.
  873 
  874         toI16Reg = toI8Reg -- for now
  875 
  876         conversionNop :: Format -> CmmExpr -> NatM Register
  877         conversionNop new_format expr
  878             = do e_code <- getRegister' platform is32Bit expr
  879                  return (swizzleRegisterRep e_code new_format)
  880 
  881 
  882 getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
  883   case mop of
  884       MO_F_Eq _ -> condFltReg is32Bit EQQ x y
  885       MO_F_Ne _ -> condFltReg is32Bit NE  x y
  886       MO_F_Gt _ -> condFltReg is32Bit GTT x y
  887       MO_F_Ge _ -> condFltReg is32Bit GE  x y
  888       -- Invert comparison condition and swap operands
  889       -- See Note [SSE Parity Checks]
  890       MO_F_Lt _ -> condFltReg is32Bit GTT  y x
  891       MO_F_Le _ -> condFltReg is32Bit GE   y x
  892 
  893       MO_Eq _   -> condIntReg EQQ x y
  894       MO_Ne _   -> condIntReg NE  x y
  895 
  896       MO_S_Gt _ -> condIntReg GTT x y
  897       MO_S_Ge _ -> condIntReg GE  x y
  898       MO_S_Lt _ -> condIntReg LTT x y
  899       MO_S_Le _ -> condIntReg LE  x y
  900 
  901       MO_U_Gt _ -> condIntReg GU  x y
  902       MO_U_Ge _ -> condIntReg GEU x y
  903       MO_U_Lt _ -> condIntReg LU  x y
  904       MO_U_Le _ -> condIntReg LEU x y
  905 
  906       MO_F_Add w   -> trivialFCode_sse2 w ADD  x y
  907 
  908       MO_F_Sub w   -> trivialFCode_sse2 w SUB  x y
  909 
  910       MO_F_Quot w  -> trivialFCode_sse2 w FDIV x y
  911 
  912       MO_F_Mul w   -> trivialFCode_sse2 w MUL x y
  913 
  914 
  915       MO_Add rep -> add_code rep x y
  916       MO_Sub rep -> sub_code rep x y
  917 
  918       MO_S_Quot rep -> div_code rep True  True  x y
  919       MO_S_Rem  rep -> div_code rep True  False x y
  920       MO_U_Quot rep -> div_code rep False True  x y
  921       MO_U_Rem  rep -> div_code rep False False x y
  922 
  923       MO_S_MulMayOflo rep -> imulMayOflo rep x y
  924 
  925       MO_Mul W8  -> imulW8 x y
  926       MO_Mul rep -> triv_op rep IMUL
  927       MO_And rep -> triv_op rep AND
  928       MO_Or  rep -> triv_op rep OR
  929       MO_Xor rep -> triv_op rep XOR
  930 
  931         {- Shift ops on x86s have constraints on their source, it
  932            either has to be Imm, CL or 1
  933             => trivialCode is not restrictive enough (sigh.)
  934         -}
  935       MO_Shl rep   -> shift_code rep SHL x y {-False-}
  936       MO_U_Shr rep -> shift_code rep SHR x y {-False-}
  937       MO_S_Shr rep -> shift_code rep SAR x y {-False-}
  938 
  939       MO_V_Insert {}   -> needLlvm
  940       MO_V_Extract {}  -> needLlvm
  941       MO_V_Add {}      -> needLlvm
  942       MO_V_Sub {}      -> needLlvm
  943       MO_V_Mul {}      -> needLlvm
  944       MO_VS_Quot {}    -> needLlvm
  945       MO_VS_Rem {}     -> needLlvm
  946       MO_VS_Neg {}     -> needLlvm
  947       MO_VF_Insert {}  -> needLlvm
  948       MO_VF_Extract {} -> needLlvm
  949       MO_VF_Add {}     -> needLlvm
  950       MO_VF_Sub {}     -> needLlvm
  951       MO_VF_Mul {}     -> needLlvm
  952       MO_VF_Quot {}    -> needLlvm
  953       MO_VF_Neg {}     -> needLlvm
  954 
  955       _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
  956   where
  957     --------------------
  958     triv_op width instr = trivialCode width op (Just op) x y
  959                         where op   = instr (intFormat width)
  960 
  961     -- Special case for IMUL for bytes, since the result of IMULB will be in
  962     -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
  963     -- values.
  964     imulW8 :: CmmExpr -> CmmExpr -> NatM Register
  965     imulW8 arg_a arg_b = do
  966         (a_reg, a_code) <- getNonClobberedReg arg_a
  967         b_code <- getAnyReg arg_b
  968 
  969         let code = a_code `appOL` b_code eax `appOL`
  970                    toOL [ IMUL2 format (OpReg a_reg) ]
  971             format = intFormat W8
  972 
  973         return (Fixed format eax code)
  974 
  975 
  976     imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
  977     imulMayOflo rep a b = do
  978          (a_reg, a_code) <- getNonClobberedReg a
  979          b_code <- getAnyReg b
  980          let
  981              shift_amt  = case rep of
  982                            W32 -> 31
  983                            W64 -> 63
  984                            _ -> panic "shift_amt"
  985 
  986              format = intFormat rep
  987              code = a_code `appOL` b_code eax `appOL`
  988                         toOL [
  989                            IMUL2 format (OpReg a_reg),   -- result in %edx:%eax
  990                            SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
  991                                 -- sign extend lower part
  992                            SUB format (OpReg edx) (OpReg eax)
  993                                 -- compare against upper
  994                            -- eax==0 if high part == sign extended low part
  995                         ]
  996          return (Fixed format eax code)
  997 
  998     --------------------
  999     shift_code :: Width
 1000                -> (Format -> Operand -> Operand -> Instr)
 1001                -> CmmExpr
 1002                -> CmmExpr
 1003                -> NatM Register
 1004 
 1005     {- Case1: shift length as immediate -}
 1006     shift_code width instr x (CmmLit lit) = do
 1007           x_code <- getAnyReg x
 1008           let
 1009                format = intFormat width
 1010                code dst
 1011                   = x_code dst `snocOL`
 1012                     instr format (OpImm (litToImm lit)) (OpReg dst)
 1013           return (Any format code)
 1014 
 1015     {- Case2: shift length is complex (non-immediate)
 1016       * y must go in %ecx.
 1017       * we cannot do y first *and* put its result in %ecx, because
 1018         %ecx might be clobbered by x.
 1019       * if we do y second, then x cannot be
 1020         in a clobbered reg.  Also, we cannot clobber x's reg
 1021         with the instruction itself.
 1022       * so we can either:
 1023         - do y first, put its result in a fresh tmp, then copy it to %ecx later
 1024         - do y second and put its result into %ecx.  x gets placed in a fresh
 1025           tmp.  This is likely to be better, because the reg alloc can
 1026           eliminate this reg->reg move here (it won't eliminate the other one,
 1027           because the move is into the fixed %ecx).
 1028       * in the case of C calls the use of ecx here can interfere with arguments.
 1029         We avoid this with the hack described in Note [Evaluate C-call
 1030         arguments before placing in destination registers]
 1031     -}
 1032     shift_code width instr x y{-amount-} = do
 1033         x_code <- getAnyReg x
 1034         let format = intFormat width
 1035         tmp <- getNewRegNat format
 1036         y_code <- getAnyReg y
 1037         let
 1038            code = x_code tmp `appOL`
 1039                   y_code ecx `snocOL`
 1040                   instr format (OpReg ecx) (OpReg tmp)
 1041         return (Fixed format tmp code)
 1042 
 1043     --------------------
 1044     add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
 1045     add_code rep x (CmmLit (CmmInt y _))
 1046         | is32BitInteger y
 1047         , rep /= W8 -- LEA doesn't support byte size (#18614)
 1048         = add_int rep x y
 1049     add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
 1050       where format = intFormat rep
 1051     -- TODO: There are other interesting patterns we want to replace
 1052     --     with a LEA, e.g. `(x + offset) + (y << shift)`.
 1053 
 1054     --------------------
 1055     sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
 1056     sub_code rep x (CmmLit (CmmInt y _))
 1057         | is32BitInteger (-y)
 1058         , rep /= W8 -- LEA doesn't support byte size (#18614)
 1059         = add_int rep x (-y)
 1060     sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y
 1061 
 1062     -- our three-operand add instruction:
 1063     add_int width x y = do
 1064         (x_reg, x_code) <- getSomeReg x
 1065         let
 1066             format = intFormat width
 1067             imm = ImmInt (fromInteger y)
 1068             code dst
 1069                = x_code `snocOL`
 1070                  LEA format
 1071                         (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
 1072                         (OpReg dst)
 1073         --
 1074         return (Any format code)
 1075 
 1076     ----------------------
 1077 
 1078     -- See Note [DIV/IDIV for bytes]
 1079     div_code W8 signed quotient x y = do
 1080         let widen | signed    = MO_SS_Conv W8 W16
 1081                   | otherwise = MO_UU_Conv W8 W16
 1082         div_code
 1083             W16
 1084             signed
 1085             quotient
 1086             (CmmMachOp widen [x])
 1087             (CmmMachOp widen [y])
 1088 
 1089     div_code width signed quotient x y = do
 1090            (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
 1091            x_code <- getAnyReg x
 1092            let
 1093              format = intFormat width
 1094              widen | signed    = CLTD format
 1095                    | otherwise = XOR format (OpReg edx) (OpReg edx)
 1096 
 1097              instr | signed    = IDIV
 1098                    | otherwise = DIV
 1099 
 1100              code = y_code `appOL`
 1101                     x_code eax `appOL`
 1102                     toOL [widen, instr format y_op]
 1103 
 1104              result | quotient  = eax
 1105                     | otherwise = edx
 1106 
 1107            return (Fixed format result code)
 1108 
 1109 
 1110 getRegister' _ _ (CmmLoad mem pk)
 1111   | isFloatType pk
 1112   = do
 1113     Amode addr mem_code <- getAmode mem
 1114     loadFloatAmode  (typeWidth pk) addr mem_code
 1115 
 1116 getRegister' _ is32Bit (CmmLoad mem pk)
 1117   | is32Bit && not (isWord64 pk)
 1118   = do
 1119     code <- intLoadCode instr mem
 1120     return (Any format code)
 1121   where
 1122     width = typeWidth pk
 1123     format = intFormat width
 1124     instr = case width of
 1125                 W8     -> MOVZxL II8
 1126                 _other -> MOV format
 1127         -- We always zero-extend 8-bit loads, if we
 1128         -- can't think of anything better.  This is because
 1129         -- we can't guarantee access to an 8-bit variant of every register
 1130         -- (esi and edi don't have 8-bit variants), so to make things
 1131         -- simpler we do our 8-bit arithmetic with full 32-bit registers.
 1132 
 1133 -- Simpler memory load code on x86_64
 1134 getRegister' _ is32Bit (CmmLoad mem pk)
 1135  | not is32Bit
 1136   = do
 1137     code <- intLoadCode (MOV format) mem
 1138     return (Any format code)
 1139   where format = intFormat $ typeWidth pk
 1140 
 1141 getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
 1142   = let
 1143         format = intFormat width
 1144 
 1145         -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
 1146         format1 = if is32Bit then format
 1147                            else case format of
 1148                                 II64 -> II32
 1149                                 _ -> format
 1150         code dst
 1151            = unitOL (XOR format1 (OpReg dst) (OpReg dst))
 1152     in
 1153         return (Any format code)
 1154 
 1155   -- optimisation for loading small literals on x86_64: take advantage
 1156   -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
 1157   -- instruction forms are shorter.
 1158 getRegister' platform is32Bit (CmmLit lit)
 1159   | not is32Bit, isWord64 (cmmLitType platform lit), not (isBigLit lit)
 1160   = let
 1161         imm = litToImm lit
 1162         code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
 1163     in
 1164         return (Any II64 code)
 1165   where
 1166    isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
 1167    isBigLit _ = False
 1168         -- note1: not the same as (not.is32BitLit), because that checks for
 1169         -- signed literals that fit in 32 bits, but we want unsigned
 1170         -- literals here.
 1171         -- note2: all labels are small, because we're assuming the
 1172         -- small memory model (see gcc docs, -mcmodel=small).
 1173 
 1174 getRegister' platform _ (CmmLit lit)
 1175   = do let format = cmmTypeFormat (cmmLitType platform lit)
 1176            imm = litToImm lit
 1177            code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
 1178        return (Any format code)
 1179 
 1180 getRegister' platform _ other
 1181     | isVecExpr other  = needLlvm
 1182     | otherwise        = pprPanic "getRegister(x86)" (pdoc platform other)
 1183 
 1184 
 1185 intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
 1186    -> NatM (Reg -> InstrBlock)
 1187 intLoadCode instr mem = do
 1188   Amode src mem_code <- getAmode mem
 1189   return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))
 1190 
 1191 -- Compute an expression into *any* register, adding the appropriate
 1192 -- move instruction if necessary.
 1193 getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
 1194 getAnyReg expr = do
 1195   r <- getRegister expr
 1196   anyReg r
 1197 
 1198 anyReg :: Register -> NatM (Reg -> InstrBlock)
 1199 anyReg (Any _ code)          = return code
 1200 anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
 1201 
 1202 -- A bit like getSomeReg, but we want a reg that can be byte-addressed.
 1203 -- Fixed registers might not be byte-addressable, so we make sure we've
 1204 -- got a temporary, inserting an extra reg copy if necessary.
 1205 getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
 1206 getByteReg expr = do
 1207   is32Bit <- is32BitPlatform
 1208   if is32Bit
 1209       then do r <- getRegister expr
 1210               case r of
 1211                 Any rep code -> do
 1212                     tmp <- getNewRegNat rep
 1213                     return (tmp, code tmp)
 1214                 Fixed rep reg code
 1215                     | isVirtualReg reg -> return (reg,code)
 1216                     | otherwise -> do
 1217                         tmp <- getNewRegNat rep
 1218                         return (tmp, code `snocOL` reg2reg rep reg tmp)
 1219                     -- ToDo: could optimise slightly by checking for
 1220                     -- byte-addressable real registers, but that will
 1221                     -- happen very rarely if at all.
 1222       else getSomeReg expr -- all regs are byte-addressable on x86_64
 1223 
 1224 -- Another variant: this time we want the result in a register that cannot
 1225 -- be modified by code to evaluate an arbitrary expression.
 1226 getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
 1227 getNonClobberedReg expr = do
 1228   r <- getRegister expr
 1229   platform <- ncgPlatform <$> getConfig
 1230   case r of
 1231     Any rep code -> do
 1232         tmp <- getNewRegNat rep
 1233         return (tmp, code tmp)
 1234     Fixed rep reg code
 1235         -- only certain regs can be clobbered
 1236         | reg `elem` instrClobberedRegs platform
 1237         -> do
 1238                 tmp <- getNewRegNat rep
 1239                 return (tmp, code `snocOL` reg2reg rep reg tmp)
 1240         | otherwise ->
 1241                 return (reg, code)
 1242 
 1243 reg2reg :: Format -> Reg -> Reg -> Instr
 1244 reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
 1245 
 1246 
 1247 --------------------------------------------------------------------------------
 1248 
 1249 -- | Convert a 'CmmExpr' representing a memory address into an 'Amode'.
 1250 --
 1251 -- An 'Amode' is a datatype representing a valid address form for the target
 1252 -- (e.g. "Base + Index + disp" or immediate) and the code to compute it.
 1253 getAmode :: CmmExpr -> NatM Amode
 1254 getAmode e = do
 1255    platform <- getPlatform
 1256    let is32Bit = target32Bit platform
 1257 
 1258    case e of
 1259       CmmRegOff r n
 1260          -> getAmode $ mangleIndexTree platform r n
 1261 
 1262       CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), CmmLit displacement]
 1263          | not is32Bit
 1264          -> return $ Amode (ripRel (litToImm displacement)) nilOL
 1265 
 1266       -- This is all just ridiculous, since it carefully undoes
 1267       -- what mangleIndexTree has just done.
 1268       CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)]
 1269          | is32BitLit is32Bit lit
 1270          -- assert (rep == II32)???
 1271          -> do
 1272             (x_reg, x_code) <- getSomeReg x
 1273             let off = ImmInt (-(fromInteger i))
 1274             return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
 1275 
 1276       CmmMachOp (MO_Add _rep) [x, CmmLit lit]
 1277          | is32BitLit is32Bit lit
 1278          -- assert (rep == II32)???
 1279          -> do
 1280             (x_reg, x_code) <- getSomeReg x
 1281             let off = litToImm lit
 1282             return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
 1283 
 1284       -- Turn (lit1 << n  + lit2) into  (lit2 + lit1 << n) so it will be
 1285       -- recognised by the next rule.
 1286       CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _), b@(CmmLit _)]
 1287          -> getAmode (CmmMachOp (MO_Add rep) [b,a])
 1288 
 1289       -- Matches: (x + offset) + (y << shift)
 1290       CmmMachOp (MO_Add _) [CmmRegOff x offset, CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)]]
 1291          | shift == 0 || shift == 1 || shift == 2 || shift == 3
 1292          -> x86_complex_amode (CmmReg x) y shift (fromIntegral offset)
 1293 
 1294       CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)]]
 1295          | shift == 0 || shift == 1 || shift == 2 || shift == 3
 1296          -> x86_complex_amode x y shift 0
 1297 
 1298       CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Add _) [CmmMachOp (MO_Shl _)
 1299                                                     [y, CmmLit (CmmInt shift _)], CmmLit (CmmInt offset _)]]
 1300          | shift == 0 || shift == 1 || shift == 2 || shift == 3
 1301          && is32BitInteger offset
 1302          -> x86_complex_amode x y shift offset
 1303 
 1304       CmmMachOp (MO_Add _) [x,y]
 1305          | not (isLit y) -- we already handle valid literals above.
 1306          -> x86_complex_amode x y 0 0
 1307 
 1308       CmmLit lit
 1309          | is32BitLit is32Bit lit
 1310          -> return (Amode (ImmAddr (litToImm lit) 0) nilOL)
 1311 
 1312       -- Literal with offsets too big (> 32 bits) fails during the linking phase
 1313       -- (#15570). We already handled valid literals above so we don't have to
 1314       -- test anything here.
 1315       CmmLit (CmmLabelOff l off)
 1316          -> getAmode (CmmMachOp (MO_Add W64) [ CmmLit (CmmLabel l)
 1317                                              , CmmLit (CmmInt (fromIntegral off) W64)
 1318                                              ])
 1319       CmmLit (CmmLabelDiffOff l1 l2 off w)
 1320          -> getAmode (CmmMachOp (MO_Add W64) [ CmmLit (CmmLabelDiffOff l1 l2 0 w)
 1321                                              , CmmLit (CmmInt (fromIntegral off) W64)
 1322                                              ])
 1323 
 1324       -- in case we can't do something better, we just compute the expression
 1325       -- and put the result in a register
 1326       _ -> do
 1327         (reg,code) <- getSomeReg e
 1328         return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
 1329 
 1330 
 1331 
 1332 -- | Like 'getAmode', but on 32-bit use simple register addressing
 1333 -- (i.e. no index register). This stops us from running out of
 1334 -- registers on x86 when using instructions such as cmpxchg, which can
 1335 -- use up to three virtual registers and one fixed register.
 1336 getSimpleAmode :: Bool -> CmmExpr -> NatM Amode
 1337 getSimpleAmode is32Bit addr
 1338     | is32Bit = do
 1339         addr_code <- getAnyReg addr
 1340         config <- getConfig
 1341         addr_r <- getNewRegNat (intFormat (ncgWordWidth config))
 1342         let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
 1343         return $! Amode amode (addr_code addr_r)
 1344     | otherwise = getAmode addr
 1345 
 1346 x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
 1347 x86_complex_amode base index shift offset
 1348   = do (x_reg, x_code) <- getNonClobberedReg base
 1349         -- x must be in a temp, because it has to stay live over y_code
 1350         -- we could compare x_reg and y_reg and do something better here...
 1351        (y_reg, y_code) <- getSomeReg index
 1352        let
 1353            code = x_code `appOL` y_code
 1354            base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
 1355                                 n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
 1356        return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
 1357                code)
 1358 
 1359 
 1360 
 1361 
 1362 -- -----------------------------------------------------------------------------
 1363 -- getOperand: sometimes any operand will do.
 1364 
 1365 -- getNonClobberedOperand: the value of the operand will remain valid across
 1366 -- the computation of an arbitrary expression, unless the expression
 1367 -- is computed directly into a register which the operand refers to
 1368 -- (see trivialCode where this function is used for an example).
 1369 
 1370 getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
 1371 getNonClobberedOperand (CmmLit lit) =
 1372   if isSuitableFloatingPointLit lit
 1373   then do
 1374     let CmmFloat _ w = lit
 1375     Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
 1376     return (OpAddr addr, code)
 1377   else do
 1378     is32Bit <- is32BitPlatform
 1379     platform <- getPlatform
 1380     if is32BitLit is32Bit lit && not (isFloatType (cmmLitType platform lit))
 1381     then return (OpImm (litToImm lit), nilOL)
 1382     else getNonClobberedOperand_generic (CmmLit lit)
 1383 
 1384 getNonClobberedOperand (CmmLoad mem pk) = do
 1385   is32Bit <- is32BitPlatform
 1386   -- this logic could be simplified
 1387   -- TODO FIXME
 1388   if   (if is32Bit then not (isWord64 pk) else True)
 1389       -- if 32bit and pk is at float/double/simd value
 1390       -- or if 64bit
 1391       --  this could use some eyeballs or i'll need to stare at it more later
 1392     then do
 1393       platform <- ncgPlatform <$> getConfig
 1394       Amode src mem_code <- getAmode mem
 1395       (src',save_code) <-
 1396         if (amodeCouldBeClobbered platform src)
 1397                 then do
 1398                    tmp <- getNewRegNat (archWordFormat is32Bit)
 1399                    return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
 1400                            unitOL (LEA (archWordFormat is32Bit)
 1401                                        (OpAddr src)
 1402                                        (OpReg tmp)))
 1403                 else
 1404                    return (src, nilOL)
 1405       return (OpAddr src', mem_code `appOL` save_code)
 1406     else
 1407       -- if its a word or gcptr on 32bit?
 1408       getNonClobberedOperand_generic (CmmLoad mem pk)
 1409 
 1410 getNonClobberedOperand e = getNonClobberedOperand_generic e
 1411 
 1412 getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
 1413 getNonClobberedOperand_generic e = do
 1414   (reg, code) <- getNonClobberedReg e
 1415   return (OpReg reg, code)
 1416 
 1417 amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
 1418 amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)
 1419 
 1420 regClobbered :: Platform -> Reg -> Bool
 1421 regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
 1422 regClobbered _ _ = False
 1423 
 1424 -- getOperand: the operand is not required to remain valid across the
 1425 -- computation of an arbitrary expression.
 1426 getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
 1427 
 1428 getOperand (CmmLit lit) = do
 1429   use_sse2 <- sse2Enabled
 1430   if (use_sse2 && isSuitableFloatingPointLit lit)
 1431     then do
 1432       let CmmFloat _ w = lit
 1433       Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
 1434       return (OpAddr addr, code)
 1435     else do
 1436 
 1437   is32Bit <- is32BitPlatform
 1438   platform <- getPlatform
 1439   if is32BitLit is32Bit lit && not (isFloatType (cmmLitType platform lit))
 1440     then return (OpImm (litToImm lit), nilOL)
 1441     else getOperand_generic (CmmLit lit)
 1442 
 1443 getOperand (CmmLoad mem pk) = do
 1444   is32Bit <- is32BitPlatform
 1445   use_sse2 <- sse2Enabled
 1446   if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
 1447      then do
 1448        Amode src mem_code <- getAmode mem
 1449        return (OpAddr src, mem_code)
 1450      else
 1451        getOperand_generic (CmmLoad mem pk)
 1452 
 1453 getOperand e = getOperand_generic e
 1454 
 1455 getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
 1456 getOperand_generic e = do
 1457     (reg, code) <- getSomeReg e
 1458     return (OpReg reg, code)
 1459 
 1460 isOperand :: Bool -> CmmExpr -> Bool
 1461 isOperand _ (CmmLoad _ _) = True
 1462 isOperand is32Bit (CmmLit lit)  = is32BitLit is32Bit lit
 1463                           || isSuitableFloatingPointLit lit
 1464 isOperand _ _            = False
 1465 
 1466 -- | Given a 'Register', produce a new 'Register' with an instruction block
 1467 -- which will check the value for alignment. Used for @-falignment-sanitisation@.
 1468 addAlignmentCheck :: Int -> Register -> Register
 1469 addAlignmentCheck align reg =
 1470     case reg of
 1471       Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
 1472       Any fmt f          -> Any fmt (\reg -> f reg `appOL` check fmt reg)
 1473   where
 1474     check :: Format -> Reg -> InstrBlock
 1475     check fmt reg =
 1476         assert (not $ isFloatFormat fmt) $
 1477         toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
 1478              , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
 1479              ]
 1480 
 1481 memConstant :: Alignment -> CmmLit -> NatM Amode
 1482 memConstant align lit = do
 1483   lbl <- getNewLabelNat
 1484   let rosection = Section ReadOnlyData lbl
 1485   config <- getConfig
 1486   platform <- getPlatform
 1487   (addr, addr_code) <- if target32Bit platform
 1488                        then do dynRef <- cmmMakeDynamicReference
 1489                                              config
 1490                                              DataReference
 1491                                              lbl
 1492                                Amode addr addr_code <- getAmode dynRef
 1493                                return (addr, addr_code)
 1494                        else return (ripRel (ImmCLbl lbl), nilOL)
 1495   let code =
 1496         LDATA rosection (align, CmmStaticsRaw lbl [CmmStaticLit lit])
 1497         `consOL` addr_code
 1498   return (Amode addr code)
 1499 
 1500 
 1501 loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
 1502 loadFloatAmode w addr addr_code = do
 1503   let format = floatFormat w
 1504       code dst = addr_code `snocOL`
 1505                     MOV format (OpAddr addr) (OpReg dst)
 1506 
 1507   return (Any format code)
 1508 
 1509 
 1510 -- if we want a floating-point literal as an operand, we can
 1511 -- use it directly from memory.  However, if the literal is
 1512 -- zero, we're better off generating it into a register using
 1513 -- xor.
 1514 isSuitableFloatingPointLit :: CmmLit -> Bool
 1515 isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
 1516 isSuitableFloatingPointLit _ = False
 1517 
 1518 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
 1519 getRegOrMem e@(CmmLoad mem pk) = do
 1520   is32Bit <- is32BitPlatform
 1521   use_sse2 <- sse2Enabled
 1522   if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
 1523      then do
 1524        Amode src mem_code <- getAmode mem
 1525        return (OpAddr src, mem_code)
 1526      else do
 1527        (reg, code) <- getNonClobberedReg e
 1528        return (OpReg reg, code)
 1529 getRegOrMem e = do
 1530     (reg, code) <- getNonClobberedReg e
 1531     return (OpReg reg, code)
 1532 
 1533 is32BitLit :: Bool -> CmmLit -> Bool
 1534 is32BitLit is32Bit lit
 1535    | not is32Bit = case lit of
 1536       CmmInt i W64              -> is32BitInteger i
 1537       -- assume that labels are in the range 0-2^31-1: this assumes the
 1538       -- small memory model (see gcc docs, -mcmodel=small).
 1539       CmmLabel _                -> True
 1540       -- however we can't assume that label offsets are in this range
 1541       -- (see #15570)
 1542       CmmLabelOff _ off         -> is32BitInteger (fromIntegral off)
 1543       CmmLabelDiffOff _ _ off _ -> is32BitInteger (fromIntegral off)
 1544       _                         -> True
 1545 is32BitLit _ _ = True
 1546 
 1547 
 1548 
 1549 
 1550 -- Set up a condition code for a conditional branch.
 1551 
 1552 getCondCode :: CmmExpr -> NatM CondCode
 1553 
 1554 -- yes, they really do seem to want exactly the same!
 1555 
 1556 getCondCode (CmmMachOp mop [x, y])
 1557   =
 1558     case mop of
 1559       MO_F_Eq W32 -> condFltCode EQQ x y
 1560       MO_F_Ne W32 -> condFltCode NE  x y
 1561       MO_F_Gt W32 -> condFltCode GTT x y
 1562       MO_F_Ge W32 -> condFltCode GE  x y
 1563       -- Invert comparison condition and swap operands
 1564       -- See Note [SSE Parity Checks]
 1565       MO_F_Lt W32 -> condFltCode GTT  y x
 1566       MO_F_Le W32 -> condFltCode GE   y x
 1567 
 1568       MO_F_Eq W64 -> condFltCode EQQ x y
 1569       MO_F_Ne W64 -> condFltCode NE  x y
 1570       MO_F_Gt W64 -> condFltCode GTT x y
 1571       MO_F_Ge W64 -> condFltCode GE  x y
 1572       MO_F_Lt W64 -> condFltCode GTT y x
 1573       MO_F_Le W64 -> condFltCode GE  y x
 1574 
 1575       _ -> condIntCode (machOpToCond mop) x y
 1576 
 1577 getCondCode other = do
 1578    platform <- getPlatform
 1579    pprPanic "getCondCode(2)(x86,x86_64)" (pdoc platform other)
 1580 
 1581 machOpToCond :: MachOp -> Cond
 1582 machOpToCond mo = case mo of
 1583   MO_Eq _   -> EQQ
 1584   MO_Ne _   -> NE
 1585   MO_S_Gt _ -> GTT
 1586   MO_S_Ge _ -> GE
 1587   MO_S_Lt _ -> LTT
 1588   MO_S_Le _ -> LE
 1589   MO_U_Gt _ -> GU
 1590   MO_U_Ge _ -> GEU
 1591   MO_U_Lt _ -> LU
 1592   MO_U_Le _ -> LEU
 1593   _other -> pprPanic "machOpToCond" (pprMachOp mo)
 1594 
 1595 
 1596 -- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
 1597 -- passed back up the tree.
 1598 
 1599 condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
 1600 condIntCode cond x y = do is32Bit <- is32BitPlatform
 1601                           condIntCode' is32Bit cond x y
 1602 
 1603 condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
 1604 
 1605 -- memory vs immediate
 1606 condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
 1607  | is32BitLit is32Bit lit = do
 1608     Amode x_addr x_code <- getAmode x
 1609     let
 1610         imm  = litToImm lit
 1611         code = x_code `snocOL`
 1612                   CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
 1613     --
 1614     return (CondCode False cond code)
 1615 
 1616 -- anything vs zero, using a mask
 1617 -- TODO: Add some sanity checking!!!!
 1618 condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
 1619     | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
 1620     = do
 1621       (x_reg, x_code) <- getSomeReg x
 1622       let
 1623          code = x_code `snocOL`
 1624                 TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
 1625       --
 1626       return (CondCode False cond code)
 1627 
 1628 -- anything vs zero
 1629 condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
 1630     (x_reg, x_code) <- getSomeReg x
 1631     let
 1632         code = x_code `snocOL`
 1633                   TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
 1634     --
 1635     return (CondCode False cond code)
 1636 
 1637 -- anything vs operand
 1638 condIntCode' is32Bit cond x y
 1639  | isOperand is32Bit y = do
 1640     platform <- getPlatform
 1641     (x_reg, x_code) <- getNonClobberedReg x
 1642     (y_op,  y_code) <- getOperand y
 1643     let
 1644         code = x_code `appOL` y_code `snocOL`
 1645                   CMP (cmmTypeFormat (cmmExprType platform x)) y_op (OpReg x_reg)
 1646     return (CondCode False cond code)
 1647 -- operand vs. anything: invert the comparison so that we can use a
 1648 -- single comparison instruction.
 1649  | isOperand is32Bit x
 1650  , Just revcond <- maybeFlipCond cond = do
 1651     platform <- getPlatform
 1652     (y_reg, y_code) <- getNonClobberedReg y
 1653     (x_op,  x_code) <- getOperand x
 1654     let
 1655         code = y_code `appOL` x_code `snocOL`
 1656                   CMP (cmmTypeFormat (cmmExprType platform x)) x_op (OpReg y_reg)
 1657     return (CondCode False revcond code)
 1658 
 1659 -- anything vs anything
 1660 condIntCode' _ cond x y = do
 1661   platform <- getPlatform
 1662   (y_reg, y_code) <- getNonClobberedReg y
 1663   (x_op, x_code) <- getRegOrMem x
 1664   let
 1665         code = y_code `appOL`
 1666                x_code `snocOL`
 1667                   CMP (cmmTypeFormat (cmmExprType platform x)) (OpReg y_reg) x_op
 1668   return (CondCode False cond code)
 1669 
 1670 
 1671 
 1672 --------------------------------------------------------------------------------
 1673 condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
 1674 
 1675 condFltCode cond x y
 1676   =  condFltCode_sse2
 1677   where
 1678 
 1679 
 1680   -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
 1681   -- an operand, but the right must be a reg.  We can probably do better
 1682   -- than this general case...
 1683   condFltCode_sse2 = do
 1684     platform <- getPlatform
 1685     (x_reg, x_code) <- getNonClobberedReg x
 1686     (y_op, y_code) <- getOperand y
 1687     let
 1688         code = x_code `appOL`
 1689                y_code `snocOL`
 1690                   CMP (floatFormat $ cmmExprWidth platform x) y_op (OpReg x_reg)
 1691         -- NB(1): we need to use the unsigned comparison operators on the
 1692         -- result of this comparison.
 1693     return (CondCode True (condToUnsigned cond) code)
 1694 
 1695 -- -----------------------------------------------------------------------------
 1696 -- Generating assignments
 1697 
 1698 -- Assignments are really at the heart of the whole code generation
 1699 -- business.  Almost all top-level nodes of any real importance are
 1700 -- assignments, which correspond to loads, stores, or register
 1701 -- transfers.  If we're really lucky, some of the register transfers
 1702 -- will go away, because we can use the destination register to
 1703 -- complete the code generation for the right hand side.  This only
 1704 -- fails when the right hand side is forced into a fixed register
 1705 -- (e.g. the result of a call).
 1706 
 1707 assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
 1708 assignReg_IntCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
 1709 
 1710 assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
 1711 assignReg_FltCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
 1712 
 1713 
 1714 -- integer assignment to memory
 1715 
 1716 -- specific case of adding/subtracting an integer to a particular address.
 1717 -- ToDo: catch other cases where we can use an operation directly on a memory
 1718 -- address.
 1719 assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
 1720                                                  CmmLit (CmmInt i _)])
 1721    | addr == addr2, pk /= II64 || is32BitInteger i,
 1722      Just instr <- check op
 1723    = do Amode amode code_addr <- getAmode addr
 1724         let code = code_addr `snocOL`
 1725                    instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
 1726         return code
 1727    where
 1728         check (MO_Add _) = Just ADD
 1729         check (MO_Sub _) = Just SUB
 1730         check _ = Nothing
 1731         -- ToDo: more?
 1732 
 1733 -- general case
 1734 assignMem_IntCode pk addr src = do
 1735     is32Bit <- is32BitPlatform
 1736     Amode addr code_addr <- getAmode addr
 1737     (code_src, op_src)   <- get_op_RI is32Bit src
 1738     let
 1739         code = code_src `appOL`
 1740                code_addr `snocOL`
 1741                   MOV pk op_src (OpAddr addr)
 1742         -- NOTE: op_src is stable, so it will still be valid
 1743         -- after code_addr.  This may involve the introduction
 1744         -- of an extra MOV to a temporary register, but we hope
 1745         -- the register allocator will get rid of it.
 1746     --
 1747     return code
 1748   where
 1749     get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand)   -- code, operator
 1750     get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
 1751       = return (nilOL, OpImm (litToImm lit))
 1752     get_op_RI _ op
 1753       = do (reg,code) <- getNonClobberedReg op
 1754            return (code, OpReg reg)
 1755 
 1756 
 1757 -- Assign; dst is a reg, rhs is mem
 1758 assignReg_IntCode pk reg (CmmLoad src _) = do
 1759   load_code <- intLoadCode (MOV pk) src
 1760   platform <- ncgPlatform <$> getConfig
 1761   return (load_code (getRegisterReg platform reg))
 1762 
 1763 -- dst is a reg, but src could be anything
 1764 assignReg_IntCode _ reg src = do
 1765   platform <- ncgPlatform <$> getConfig
 1766   code <- getAnyReg src
 1767   return (code (getRegisterReg platform reg))
 1768 
 1769 
 1770 -- Floating point assignment to memory
 1771 assignMem_FltCode pk addr src = do
 1772   (src_reg, src_code) <- getNonClobberedReg src
 1773   Amode addr addr_code <- getAmode addr
 1774   let
 1775         code = src_code `appOL`
 1776                addr_code `snocOL`
 1777                MOV pk (OpReg src_reg) (OpAddr addr)
 1778 
 1779   return code
 1780 
 1781 -- Floating point assignment to a register/temporary
 1782 assignReg_FltCode _ reg src = do
 1783   src_code <- getAnyReg src
 1784   platform <- ncgPlatform <$> getConfig
 1785   return (src_code (getRegisterReg platform reg))
 1786 
 1787 
 1788 genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
 1789 
 1790 genJump (CmmLoad mem _) regs = do
 1791   Amode target code <- getAmode mem
 1792   return (code `snocOL` JMP (OpAddr target) regs)
 1793 
 1794 genJump (CmmLit lit) regs =
 1795   return (unitOL (JMP (OpImm (litToImm lit)) regs))
 1796 
 1797 genJump expr regs = do
 1798   (reg,code) <- getSomeReg expr
 1799   return (code `snocOL` JMP (OpReg reg) regs)
 1800 
 1801 
 1802 -- -----------------------------------------------------------------------------
 1803 --  Unconditional branches
 1804 
 1805 genBranch :: BlockId -> InstrBlock
 1806 genBranch = toOL . mkJumpInstr
 1807 
 1808 
 1809 
 1810 -- -----------------------------------------------------------------------------
 1811 --  Conditional jumps/branches
 1812 
 1813 {-
 1814 Conditional jumps are always to local labels, so we can use branch
 1815 instructions.  We peek at the arguments to decide what kind of
 1816 comparison to do.
 1817 
 1818 I386: First, we have to ensure that the condition
 1819 codes are set according to the supplied comparison operation.
 1820 -}
 1821 
 1822 {-  Note [64-bit integer comparisons on 32-bit]
 1823     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1824 
 1825     When doing these comparisons there are 2 kinds of
 1826     comparisons.
 1827 
 1828     * Comparison for equality (or lack thereof)
 1829 
 1830     We use xor to check if high/low bits are
 1831     equal. Then combine the results using or and
 1832     perform a single conditional jump based on the
 1833     result.
 1834 
 1835     * Other comparisons:
 1836 
 1837     We map all other comparisons to the >= operation.
 1838     Why? Because it's easy to encode it with a single
 1839     conditional jump.
 1840 
 1841     We do this by first computing [r1_lo - r2_lo]
 1842     and use the carry flag to compute
 1843     [r1_high - r2_high - CF].
 1844 
 1845     At which point if r1 >= r2 then the result will be
 1846     positive. Otherwise negative so we can branch on this
 1847     condition.
 1848 
 1849 -}
 1850 
 1851 
 1852 genCondBranch
 1853     :: BlockId      -- the source of the jump
 1854     -> BlockId      -- the true branch target
 1855     -> BlockId      -- the false branch target
 1856     -> CmmExpr      -- the condition on which to branch
 1857     -> NatM InstrBlock -- Instructions
 1858 
 1859 genCondBranch bid id false expr = do
 1860   is32Bit <- is32BitPlatform
 1861   genCondBranch' is32Bit bid id false expr
 1862 
 1863 -- | We return the instructions generated.
 1864 genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
 1865                -> NatM InstrBlock
 1866 
 1867 -- 64-bit integer comparisons on 32-bit
 1868 -- See Note [64-bit integer comparisons on 32-bit]
 1869 genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
 1870   | is32Bit, Just W64 <- maybeIntComparison mop = do
 1871 
 1872   -- The resulting registers here are both the lower part of
 1873   -- the register as well as a way to get at the higher part.
 1874   ChildCode64 code1 r1 <- iselExpr64 e1
 1875   ChildCode64 code2 r2 <- iselExpr64 e2
 1876   let cond = machOpToCond mop :: Cond
 1877 
 1878   let cmpCode = intComparison cond true false r1 r2
 1879   return $ code1 `appOL` code2 `appOL` cmpCode
 1880 
 1881   where
 1882     intComparison :: Cond -> BlockId -> BlockId -> Reg -> Reg -> InstrBlock
 1883     intComparison cond true false r1_lo r2_lo =
 1884       case cond of
 1885         -- Impossible results of machOpToCond
 1886         ALWAYS  -> panic "impossible"
 1887         NEG     -> panic "impossible"
 1888         POS     -> panic "impossible"
 1889         CARRY   -> panic "impossible"
 1890         OFLO    -> panic "impossible"
 1891         PARITY  -> panic "impossible"
 1892         NOTPARITY -> panic "impossible"
 1893         -- Special case #1 x == y and x != y
 1894         EQQ -> cmpExact
 1895         NE  -> cmpExact
 1896         -- [x >= y]
 1897         GE  -> cmpGE
 1898         GEU -> cmpGE
 1899         -- [x >  y] <==> ![y >= x]
 1900         GTT -> intComparison GE  false true r2_lo r1_lo
 1901         GU  -> intComparison GEU false true r2_lo r1_lo
 1902         -- [x <= y] <==> [y >= x]
 1903         LE  -> intComparison GE  true false r2_lo r1_lo
 1904         LEU -> intComparison GEU true false r2_lo r1_lo
 1905         -- [x <  y] <==> ![x >= x]
 1906         LTT -> intComparison GE  false true r1_lo r2_lo
 1907         LU  -> intComparison GEU false true r1_lo r2_lo
 1908       where
 1909         r1_hi = getHiVRegFromLo r1_lo
 1910         r2_hi = getHiVRegFromLo r2_lo
 1911         cmpExact :: OrdList Instr
 1912         cmpExact =
 1913           toOL
 1914             [ XOR II32 (OpReg r2_hi) (OpReg r1_hi)
 1915             , XOR II32 (OpReg r2_lo) (OpReg r1_lo)
 1916             , OR  II32 (OpReg r1_hi)  (OpReg r1_lo)
 1917             , JXX cond true
 1918             , JXX ALWAYS false
 1919             ]
 1920         cmpGE = toOL
 1921             [ CMP II32 (OpReg r2_lo) (OpReg r1_lo)
 1922             , SBB II32 (OpReg r2_hi) (OpReg r1_hi)
 1923             , JXX cond true
 1924             , JXX ALWAYS false ]
 1925 
 1926 genCondBranch' _ bid id false bool = do
 1927   CondCode is_float cond cond_code <- getCondCode bool
 1928   use_sse2 <- sse2Enabled
 1929   if not is_float || not use_sse2
 1930     then
 1931         return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
 1932     else do
 1933         -- See Note [SSE Parity Checks]
 1934         let jmpFalse = genBranch false
 1935             code
 1936                 = case cond of
 1937                   NE  -> or_unordered
 1938                   GU  -> plain_test
 1939                   GEU -> plain_test
 1940                   -- Use ASSERT so we don't break releases if
 1941                   -- LTT/LE creep in somehow.
 1942                   LTT ->
 1943                     assertPpr False (ppr "Should have been turned into >")
 1944                     and_ordered
 1945                   LE  ->
 1946                     assertPpr False (ppr "Should have been turned into >=")
 1947                     and_ordered
 1948                   _   -> and_ordered
 1949 
 1950             plain_test = unitOL (
 1951                   JXX cond id
 1952                 ) `appOL` jmpFalse
 1953             or_unordered = toOL [
 1954                   JXX cond id,
 1955                   JXX PARITY id
 1956                 ] `appOL` jmpFalse
 1957             and_ordered = toOL [
 1958                   JXX PARITY false,
 1959                   JXX cond id,
 1960                   JXX ALWAYS false
 1961                 ]
 1962         updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
 1963         return (cond_code `appOL` code)
 1964 
 1965 {-  Note [Introducing cfg edges inside basic blocks]
 1966     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1967 
 1968     During instruction selection a statement `s`
 1969     in a block B with control of the sort: B -> C
 1970     will sometimes result in control
 1971     flow of the sort:
 1972 
 1973             ┌ < ┐
 1974             v   ^
 1975       B ->  B1  ┴ -> C
 1976 
 1977     as is the case for some atomic operations.
 1978 
 1979     Now to keep the CFG in sync when introducing B1 we clearly
 1980     want to insert it between B and C. However there is
 1981     a catch when we have to deal with self loops.
 1982 
 1983     We might start with code and a CFG of these forms:
 1984 
 1985     loop:
 1986         stmt1               ┌ < ┐
 1987         ....                v   ^
 1988         stmtX              loop ┘
 1989         stmtY
 1990         ....
 1991         goto loop:
 1992 
 1993     Now we introduce B1:
 1994                             ┌ ─ ─ ─ ─ ─┐
 1995         loop:               │   ┌ <  ┐ │
 1996         instrs              v   │    │ ^
 1997         ....               loop ┴ B1 ┴ ┘
 1998         instrsFromX
 1999         stmtY
 2000         goto loop:
 2001 
 2002     This is simple, all outgoing edges from loop now simply
 2003     start from B1 instead and the code generator knows which
 2004     new edges it introduced for the self loop of B1.
 2005 
 2006     Disaster strikes if the statement Y follows the same pattern.
 2007     If we apply the same rule that all outgoing edges change then
 2008     we end up with:
 2009 
 2010         loop ─> B1 ─> B2 ┬─┐
 2011           │      │    └─<┤ │
 2012           │      └───<───┘ │
 2013           └───────<────────┘
 2014 
 2015     This is problematic. The edge B1->B1 is modified as expected.
 2016     However the modification is wrong!
 2017 
 2018     The assembly in this case looked like this:
 2019 
 2020     _loop:
 2021         <instrs>
 2022     _B1:
 2023         ...
 2024         cmpxchgq ...
 2025         jne _B1
 2026         <instrs>
 2027         <end _B1>
 2028     _B2:
 2029         ...
 2030         cmpxchgq ...
 2031         jne _B2
 2032         <instrs>
 2033         jmp loop
 2034 
 2035     There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
 2036 
 2037     The problem here is that really B1 should be two basic blocks.
 2038     Otherwise we have control flow in the *middle* of a basic block.
 2039     A contradiction!
 2040 
 2041     So to account for this we add yet another basic block marker:
 2042 
 2043     _B:
 2044         <instrs>
 2045     _B1:
 2046         ...
 2047         cmpxchgq ...
 2048         jne _B1
 2049         jmp _B1'
 2050     _B1':
 2051         <instrs>
 2052         <end _B1>
 2053     _B2:
 2054         ...
 2055 
 2056     Now when inserting B2 we will only look at the outgoing edges of B1' and
 2057     everything will work out nicely.
 2058 
 2059     You might also wonder why we don't insert jumps at the end of _B1'. There is
 2060     no way another block ends up jumping to the labels _B1 or _B2 since they are
 2061     essentially invisible to other blocks. View them as control flow labels local
 2062     to the basic block if you'd like.
 2063 
 2064     Not doing this ultimately caused (part 2 of) #17334.
 2065 -}
 2066 
 2067 
 2068 -- -----------------------------------------------------------------------------
 2069 --  Generating C calls
 2070 
 2071 -- Now the biggest nightmare---calls.  Most of the nastiness is buried in
 2072 -- @get_arg@, which moves the arguments to the correct registers/stack
 2073 -- locations.  Apart from that, the code is easy.
 2074 --
 2075 -- (If applicable) Do not fill the delay slots here; you will confuse the
 2076 -- register allocator.
 2077 --
 2078 -- See Note [Keeping track of the current block] for information why we need
 2079 -- to take/return a block id.
 2080 
 2081 genCCall
 2082     :: Bool                     -- 32 bit platform?
 2083     -> ForeignTarget            -- function to call
 2084     -> [CmmFormal]        -- where to put the result
 2085     -> [CmmActual]        -- arguments (of mixed type)
 2086     -> BlockId      -- The block we are in
 2087     -> NatM (InstrBlock, Maybe BlockId)
 2088 
 2089 -- First we deal with cases which might introduce new blocks in the stream.
 2090 
 2091 genCCall is32Bit (PrimTarget (MO_AtomicRMW width amop))
 2092                                            [dst] [addr, n] bid = do
 2093     Amode amode addr_code <-
 2094         if amop `elem` [AMO_Add, AMO_Sub]
 2095         then getAmode addr
 2096         else getSimpleAmode is32Bit addr  -- See genCCall for MO_Cmpxchg
 2097     arg <- getNewRegNat format
 2098     arg_code <- getAnyReg n
 2099     platform <- ncgPlatform <$> getConfig
 2100 
 2101     let dst_r    = getRegisterReg platform  (CmmLocal dst)
 2102     (code, lbl) <- op_code dst_r arg amode
 2103     return (addr_code `appOL` arg_code arg `appOL` code, Just lbl)
 2104   where
 2105     -- Code for the operation
 2106     op_code :: Reg       -- Destination reg
 2107             -> Reg       -- Register containing argument
 2108             -> AddrMode  -- Address of location to mutate
 2109             -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId
 2110     op_code dst_r arg amode = case amop of
 2111         -- In the common case where dst_r is a virtual register the
 2112         -- final move should go away, because it's the last use of arg
 2113         -- and the first use of dst_r.
 2114         AMO_Add  -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
 2115                                    , MOV format (OpReg arg) (OpReg dst_r)
 2116                                    ], bid)
 2117         AMO_Sub  -> return $ (toOL [ NEGI format (OpReg arg)
 2118                                    , LOCK (XADD format (OpReg arg) (OpAddr amode))
 2119                                    , MOV format (OpReg arg) (OpReg dst_r)
 2120                                    ], bid)
 2121         -- In these cases we need a new block id, and have to return it so
 2122         -- that later instruction selection can reference it.
 2123         AMO_And  -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
 2124         AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
 2125                                                     , NOT format dst
 2126                                                     ])
 2127         AMO_Or   -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
 2128         AMO_Xor  -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
 2129       where
 2130         -- Simulate operation that lacks a dedicated instruction using
 2131         -- cmpxchg.
 2132         cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
 2133                      -> NatM (OrdList Instr, BlockId)
 2134         cmpxchg_code instrs = do
 2135             lbl1 <- getBlockIdNat
 2136             lbl2 <- getBlockIdNat
 2137             tmp <- getNewRegNat format
 2138 
 2139             --Record inserted blocks
 2140             --  We turn A -> B into A -> A' -> A'' -> B
 2141             --  with a self loop on A'.
 2142             addImmediateSuccessorNat bid lbl1
 2143             addImmediateSuccessorNat lbl1 lbl2
 2144             updateCfgNat (addWeightEdge lbl1 lbl1 0)
 2145 
 2146             return $ (toOL
 2147                 [ MOV format (OpAddr amode) (OpReg eax)
 2148                 , JXX ALWAYS lbl1
 2149                 , NEWBLOCK lbl1
 2150                   -- Keep old value so we can return it:
 2151                 , MOV format (OpReg eax) (OpReg dst_r)
 2152                 , MOV format (OpReg eax) (OpReg tmp)
 2153                 ]
 2154                 `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
 2155                 [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
 2156                 , JXX NE lbl1
 2157                 -- See Note [Introducing cfg edges inside basic blocks]
 2158                 -- why this basic block is required.
 2159                 , JXX ALWAYS lbl2
 2160                 , NEWBLOCK lbl2
 2161                 ],
 2162                 lbl2)
 2163     format = intFormat width
 2164 
 2165 genCCall is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
 2166   | is32Bit, width == W64 = do
 2167       ChildCode64 vcode rlo <- iselExpr64 src
 2168       platform <- ncgPlatform <$> getConfig
 2169       let rhi     = getHiVRegFromLo rlo
 2170           dst_r   = getRegisterReg platform  (CmmLocal dst)
 2171       lbl1 <- getBlockIdNat
 2172       lbl2 <- getBlockIdNat
 2173       let format = if width == W8 then II16 else intFormat width
 2174       tmp_r <- getNewRegNat format
 2175 
 2176       -- New CFG Edges:
 2177       --  bid -> lbl2
 2178       --  bid -> lbl1 -> lbl2
 2179       --  We also changes edges originating at bid to start at lbl2 instead.
 2180       weights <- getCfgWeights
 2181       updateCfgNat (addWeightEdge bid lbl1 110 .
 2182                     addWeightEdge lbl1 lbl2 110 .
 2183                     addImmediateSuccessor weights bid lbl2)
 2184 
 2185       -- The following instruction sequence corresponds to the pseudo-code
 2186       --
 2187       --  if (src) {
 2188       --    dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
 2189       --  } else {
 2190       --    dst = 64;
 2191       --  }
 2192       let !instrs = vcode `appOL` toOL
 2193                ([ MOV      II32 (OpReg rhi)         (OpReg tmp_r)
 2194                 , OR       II32 (OpReg rlo)         (OpReg tmp_r)
 2195                 , MOV      II32 (OpImm (ImmInt 64)) (OpReg dst_r)
 2196                 , JXX EQQ    lbl2
 2197                 , JXX ALWAYS lbl1
 2198 
 2199                 , NEWBLOCK   lbl1
 2200                 , BSF     II32 (OpReg rhi)         dst_r
 2201                 , ADD     II32 (OpImm (ImmInt 32)) (OpReg dst_r)
 2202                 , BSF     II32 (OpReg rlo)         tmp_r
 2203                 , CMOV NE II32 (OpReg tmp_r)       dst_r
 2204                 , JXX ALWAYS lbl2
 2205 
 2206                 , NEWBLOCK   lbl2
 2207                 ])
 2208       return (instrs, Just lbl2)
 2209 
 2210   | otherwise = do
 2211     code_src <- getAnyReg src
 2212     config <- getConfig
 2213     let platform = ncgPlatform config
 2214     let dst_r = getRegisterReg platform (CmmLocal dst)
 2215     if ncgBmiVersion config >= Just BMI2
 2216     then do
 2217         src_r <- getNewRegNat (intFormat width)
 2218         let instrs = appOL (code_src src_r) $ case width of
 2219                 W8 -> toOL
 2220                     [ OR    II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r)
 2221                     , TZCNT II32 (OpReg src_r)        dst_r
 2222                     ]
 2223                 W16 -> toOL
 2224                     [ TZCNT  II16 (OpReg src_r) dst_r
 2225                     , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
 2226                     ]
 2227                 _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
 2228         return (instrs, Nothing)
 2229     else do
 2230         -- The following insn sequence makes sure 'ctz 0' has a defined value.
 2231         -- starting with Haswell, one could use the TZCNT insn instead.
 2232         let format = if width == W8 then II16 else intFormat width
 2233         src_r <- getNewRegNat format
 2234         tmp_r <- getNewRegNat format
 2235         let !instrs = code_src src_r `appOL` toOL
 2236                  ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
 2237                   [ BSF     format (OpReg src_r) tmp_r
 2238                   , MOV     II32   (OpImm (ImmInt bw)) (OpReg dst_r)
 2239                   , CMOV NE format (OpReg tmp_r) dst_r
 2240                   ]) -- NB: We don't need to zero-extend the result for the
 2241                      -- W8/W16 cases because the 'MOV' insn already
 2242                      -- took care of implicitly clearing the upper bits
 2243         return (instrs, Nothing)
 2244   where
 2245     bw = widthInBits width
 2246 
 2247 genCCall bits mop dst args bid = do
 2248   config <- getConfig
 2249   instr <- genCCall' config bits mop dst args bid
 2250   return (instr, Nothing)
 2251 
 2252 -- genCCall' handles cases not introducing new code blocks.
 2253 genCCall'
 2254     :: NCGConfig
 2255     -> Bool                     -- 32 bit platform?
 2256     -> ForeignTarget            -- function to call
 2257     -> [CmmFormal]        -- where to put the result
 2258     -> [CmmActual]        -- arguments (of mixed type)
 2259     -> BlockId      -- The block we are in
 2260     -> NatM InstrBlock
 2261 
 2262 -- Unroll memcpy calls if the number of bytes to copy isn't too
 2263 -- large.  Otherwise, call C's memcpy.
 2264 genCCall' config _ (PrimTarget (MO_Memcpy align)) _
 2265          [dst, src, CmmLit (CmmInt n _)] _
 2266     | fromInteger insns <= ncgInlineThresholdMemcpy config = do
 2267         code_dst <- getAnyReg dst
 2268         dst_r <- getNewRegNat format
 2269         code_src <- getAnyReg src
 2270         src_r <- getNewRegNat format
 2271         tmp_r <- getNewRegNat format
 2272         return $ code_dst dst_r `appOL` code_src src_r `appOL`
 2273             go dst_r src_r tmp_r (fromInteger n)
 2274   where
 2275     platform = ncgPlatform config
 2276     -- The number of instructions we will generate (approx). We need 2
 2277     -- instructions per move.
 2278     insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
 2279 
 2280     maxAlignment = wordAlignment platform -- only machine word wide MOVs are supported
 2281     effectiveAlignment = min (alignmentOf align) maxAlignment
 2282     format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
 2283 
 2284     -- The size of each move, in bytes.
 2285     sizeBytes :: Integer
 2286     sizeBytes = fromIntegral (formatInBytes format)
 2287 
 2288     go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
 2289     go dst src tmp i
 2290         | i >= sizeBytes =
 2291             unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
 2292             unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
 2293             go dst src tmp (i - sizeBytes)
 2294         -- Deal with remaining bytes.
 2295         | i >= 4 =  -- Will never happen on 32-bit
 2296             unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
 2297             unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
 2298             go dst src tmp (i - 4)
 2299         | i >= 2 =
 2300             unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
 2301             unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
 2302             go dst src tmp (i - 2)
 2303         | i >= 1 =
 2304             unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
 2305             unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
 2306             go dst src tmp (i - 1)
 2307         | otherwise = nilOL
 2308       where
 2309         src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
 2310                    (ImmInteger (n - i))
 2311         dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
 2312                    (ImmInteger (n - i))
 2313 
 2314 genCCall' config _ (PrimTarget (MO_Memset align)) _
 2315          [dst,
 2316           CmmLit (CmmInt c _),
 2317           CmmLit (CmmInt n _)]
 2318          _
 2319     | fromInteger insns <= ncgInlineThresholdMemset config = do
 2320         code_dst <- getAnyReg dst
 2321         dst_r <- getNewRegNat format
 2322         if format == II64 && n >= 8 then do
 2323           code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
 2324           imm8byte_r <- getNewRegNat II64
 2325           return $ code_dst dst_r `appOL`
 2326                    code_imm8byte imm8byte_r `appOL`
 2327                    go8 dst_r imm8byte_r (fromInteger n)
 2328         else
 2329           return $ code_dst dst_r `appOL`
 2330                    go4 dst_r (fromInteger n)
 2331   where
 2332     platform = ncgPlatform config
 2333     maxAlignment = wordAlignment platform -- only machine word wide MOVs are supported
 2334     effectiveAlignment = min (alignmentOf align) maxAlignment
 2335     format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
 2336     c2 = c `shiftL` 8 .|. c
 2337     c4 = c2 `shiftL` 16 .|. c2
 2338     c8 = c4 `shiftL` 32 .|. c4
 2339 
 2340     -- The number of instructions we will generate (approx). We need 1
 2341     -- instructions per move.
 2342     insns = (n + sizeBytes - 1) `div` sizeBytes
 2343 
 2344     -- The size of each move, in bytes.
 2345     sizeBytes :: Integer
 2346     sizeBytes = fromIntegral (formatInBytes format)
 2347 
 2348     -- Depending on size returns the widest MOV instruction and its
 2349     -- width.
 2350     gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
 2351     gen4 addr size
 2352         | size >= 4 =
 2353             (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
 2354         | size >= 2 =
 2355             (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
 2356         | size >= 1 =
 2357             (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
 2358         | otherwise = (nilOL, 0)
 2359 
 2360     -- Generates a 64-bit wide MOV instruction from REG to MEM.
 2361     gen8 :: AddrMode -> Reg -> InstrBlock
 2362     gen8 addr reg8byte =
 2363       unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
 2364 
 2365     -- Unrolls memset when the widest MOV is <= 4 bytes.
 2366     go4 :: Reg -> Integer -> InstrBlock
 2367     go4 dst left =
 2368       if left <= 0 then nilOL
 2369       else curMov `appOL` go4 dst (left - curWidth)
 2370       where
 2371         possibleWidth = minimum [left, sizeBytes]
 2372         dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
 2373         (curMov, curWidth) = gen4 dst_addr possibleWidth
 2374 
 2375     -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
 2376     -- argument). Falls back to go4 when all 8 byte moves are
 2377     -- exhausted.
 2378     go8 :: Reg -> Reg -> Integer -> InstrBlock
 2379     go8 dst reg8byte left =
 2380       if possibleWidth >= 8 then
 2381         let curMov = gen8 dst_addr reg8byte
 2382         in  curMov `appOL` go8 dst reg8byte (left - 8)
 2383       else go4 dst left
 2384       where
 2385         possibleWidth = minimum [left, sizeBytes]
 2386         dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
 2387 
 2388 genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _  = return nilOL
 2389 genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
 2390         -- barriers compile to no code on x86/x86-64;
 2391         -- we keep it this long in order to prevent earlier optimisations.
 2392 
 2393 genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
 2394 
 2395 genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _  [src] _ =
 2396         case n of
 2397             0 -> genPrefetch src $ PREFETCH NTA  format
 2398             1 -> genPrefetch src $ PREFETCH Lvl2 format
 2399             2 -> genPrefetch src $ PREFETCH Lvl1 format
 2400             3 -> genPrefetch src $ PREFETCH Lvl0 format
 2401             l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
 2402             -- the c / llvm prefetch convention is 0, 1, 2, and 3
 2403             -- the x86 corresponding names are : NTA, 2 , 1, and 0
 2404    where
 2405         format = archWordFormat is32bit
 2406         -- need to know what register width for pointers!
 2407         genPrefetch inRegSrc prefetchCTor =
 2408             do
 2409                 code_src <- getAnyReg inRegSrc
 2410                 src_r <- getNewRegNat format
 2411                 return $ code_src src_r `appOL`
 2412                   (unitOL (prefetchCTor  (OpAddr
 2413                               ((AddrBaseIndex (EABaseReg src_r )   EAIndexNone (ImmInt 0))))  ))
 2414                   -- prefetch always takes an address
 2415 
 2416 genCCall' _ is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
 2417     platform <- ncgPlatform <$> getConfig
 2418     let dst_r = getRegisterReg platform (CmmLocal dst)
 2419     case width of
 2420         W64 | is32Bit -> do
 2421                ChildCode64 vcode rlo <- iselExpr64 src
 2422                let dst_rhi = getHiVRegFromLo dst_r
 2423                    rhi     = getHiVRegFromLo rlo
 2424                return $ vcode `appOL`
 2425                         toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
 2426                                MOV II32 (OpReg rhi) (OpReg dst_r),
 2427                                BSWAP II32 dst_rhi,
 2428                                BSWAP II32 dst_r ]
 2429         W16 -> do code_src <- getAnyReg src
 2430                   return $ code_src dst_r `appOL`
 2431                            unitOL (BSWAP II32 dst_r) `appOL`
 2432                            unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
 2433         _   -> do code_src <- getAnyReg src
 2434                   return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
 2435   where
 2436     format = intFormat width
 2437 
 2438 genCCall' config is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
 2439          args@[src] bid = do
 2440     sse4_2 <- sse4_2Enabled
 2441     let platform = ncgPlatform config
 2442     if sse4_2
 2443         then do code_src <- getAnyReg src
 2444                 src_r <- getNewRegNat format
 2445                 let dst_r = getRegisterReg platform  (CmmLocal dst)
 2446                 return $ code_src src_r `appOL`
 2447                     (if width == W8 then
 2448                          -- The POPCNT instruction doesn't take a r/m8
 2449                          unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
 2450                          unitOL (POPCNT II16 (OpReg src_r) dst_r)
 2451                      else
 2452                          unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
 2453                     (if width == W8 || width == W16 then
 2454                          -- We used a 16-bit destination register above,
 2455                          -- so zero-extend
 2456                          unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
 2457                      else nilOL)
 2458         else do
 2459             targetExpr <- cmmMakeDynamicReference config
 2460                           CallReference lbl
 2461             let target = ForeignTarget targetExpr (ForeignConvention CCallConv
 2462                                                            [NoHint] [NoHint]
 2463                                                            CmmMayReturn)
 2464             genCCall' config is32Bit target dest_regs args bid
 2465   where
 2466     format = intFormat width
 2467     lbl = mkCmmCodeLabel primUnitId (popCntLabel width)
 2468 
 2469 genCCall' config is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
 2470          args@[src, mask] bid = do
 2471     let platform = ncgPlatform config
 2472     if ncgBmiVersion config >= Just BMI2
 2473         then do code_src  <- getAnyReg src
 2474                 code_mask <- getAnyReg mask
 2475                 src_r     <- getNewRegNat format
 2476                 mask_r    <- getNewRegNat format
 2477                 let dst_r = getRegisterReg platform  (CmmLocal dst)
 2478                 return $ code_src src_r `appOL` code_mask mask_r `appOL`
 2479                     -- PDEP only supports > 32 bit args
 2480                     ( if width == W8 || width == W16 then
 2481                         toOL
 2482                           [ MOVZxL format (OpReg src_r ) (OpReg src_r )
 2483                           , MOVZxL format (OpReg mask_r) (OpReg mask_r)
 2484                           , PDEP   II32 (OpReg mask_r) (OpReg src_r ) dst_r
 2485                           , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
 2486                           ]
 2487                       else
 2488                         unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)
 2489                     )
 2490         else do
 2491             targetExpr <- cmmMakeDynamicReference config
 2492                           CallReference lbl
 2493             let target = ForeignTarget targetExpr (ForeignConvention CCallConv
 2494                                                            [NoHint] [NoHint]
 2495                                                            CmmMayReturn)
 2496             genCCall' config is32Bit target dest_regs args bid
 2497   where
 2498     format = intFormat width
 2499     lbl = mkCmmCodeLabel primUnitId (pdepLabel width)
 2500 
 2501 genCCall' config is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
 2502          args@[src, mask] bid = do
 2503     let platform = ncgPlatform config
 2504     if ncgBmiVersion config >= Just BMI2
 2505         then do code_src  <- getAnyReg src
 2506                 code_mask <- getAnyReg mask
 2507                 src_r     <- getNewRegNat format
 2508                 mask_r    <- getNewRegNat format
 2509                 let dst_r = getRegisterReg platform  (CmmLocal dst)
 2510                 return $ code_src src_r `appOL` code_mask mask_r `appOL`
 2511                     (if width == W8 || width == W16 then
 2512                          -- The PEXT instruction doesn't take a r/m8 or 16
 2513                         toOL
 2514                           [ MOVZxL format (OpReg src_r ) (OpReg src_r )
 2515                           , MOVZxL format (OpReg mask_r) (OpReg mask_r)
 2516                           , PEXT   II32 (OpReg mask_r) (OpReg src_r ) dst_r
 2517                           , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
 2518                           ]
 2519                       else
 2520                         unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
 2521                     )
 2522         else do
 2523             targetExpr <- cmmMakeDynamicReference config
 2524                           CallReference lbl
 2525             let target = ForeignTarget targetExpr (ForeignConvention CCallConv
 2526                                                            [NoHint] [NoHint]
 2527                                                            CmmMayReturn)
 2528             genCCall' config is32Bit target dest_regs args bid
 2529   where
 2530     format = intFormat width
 2531     lbl = mkCmmCodeLabel primUnitId (pextLabel width)
 2532 
 2533 genCCall' config is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
 2534   | is32Bit && width == W64 = do
 2535     -- Fallback to `hs_clz64` on i386
 2536     targetExpr <- cmmMakeDynamicReference config CallReference lbl
 2537     let target = ForeignTarget targetExpr (ForeignConvention CCallConv
 2538                                            [NoHint] [NoHint]
 2539                                            CmmMayReturn)
 2540     genCCall' config is32Bit target dest_regs args bid
 2541 
 2542   | otherwise = do
 2543     code_src <- getAnyReg src
 2544     config <- getConfig
 2545     let platform = ncgPlatform config
 2546     let dst_r = getRegisterReg platform (CmmLocal dst)
 2547     if ncgBmiVersion config >= Just BMI2
 2548         then do
 2549             src_r <- getNewRegNat (intFormat width)
 2550             return $ appOL (code_src src_r) $ case width of
 2551                 W8 -> toOL
 2552                     [ MOVZxL II8  (OpReg src_r)       (OpReg src_r) -- zero-extend to 32 bit
 2553                     , LZCNT  II32 (OpReg src_r)       dst_r         -- lzcnt with extra 24 zeros
 2554                     , SUB    II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
 2555                     ]
 2556                 W16 -> toOL
 2557                     [ LZCNT  II16 (OpReg src_r) dst_r
 2558                     , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
 2559                     ]
 2560                 _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
 2561         else do
 2562             let format = if width == W8 then II16 else intFormat width
 2563             src_r <- getNewRegNat format
 2564             tmp_r <- getNewRegNat format
 2565             return $ code_src src_r `appOL` toOL
 2566                      ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
 2567                       [ BSR     format (OpReg src_r) tmp_r
 2568                       , MOV     II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
 2569                       , CMOV NE format (OpReg tmp_r) dst_r
 2570                       , XOR     format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
 2571                       ]) -- NB: We don't need to zero-extend the result for the
 2572                          -- W8/W16 cases because the 'MOV' insn already
 2573                          -- took care of implicitly clearing the upper bits
 2574   where
 2575     bw = widthInBits width
 2576     lbl = mkCmmCodeLabel primUnitId (clzLabel width)
 2577 
 2578 genCCall' config is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
 2579     targetExpr <- cmmMakeDynamicReference config
 2580                   CallReference lbl
 2581     let target = ForeignTarget targetExpr (ForeignConvention CCallConv
 2582                                            [NoHint] [NoHint]
 2583                                            CmmMayReturn)
 2584     genCCall' config is32Bit target dest_regs args bid
 2585   where
 2586     lbl = mkCmmCodeLabel primUnitId (word2FloatLabel width)
 2587 
 2588 genCCall' _ _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
 2589   load_code <- intLoadCode (MOV (intFormat width)) addr
 2590   platform <- ncgPlatform <$> getConfig
 2591 
 2592   return (load_code (getRegisterReg platform  (CmmLocal dst)))
 2593 
 2594 genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
 2595     code <- assignMem_IntCode (intFormat width) addr val
 2596     return $ code `snocOL` MFENCE
 2597 
 2598 genCCall' _ is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
 2599     -- On x86 we don't have enough registers to use cmpxchg with a
 2600     -- complicated addressing mode, so on that architecture we
 2601     -- pre-compute the address first.
 2602   | not (is32Bit && width == W64) = do
 2603     Amode amode addr_code <- getSimpleAmode is32Bit addr
 2604     newval <- getNewRegNat format
 2605     newval_code <- getAnyReg new
 2606     oldval <- getNewRegNat format
 2607     oldval_code <- getAnyReg old
 2608     platform <- getPlatform
 2609     let dst_r    = getRegisterReg platform  (CmmLocal dst)
 2610         code     = toOL
 2611                    [ MOV format (OpReg oldval) (OpReg eax)
 2612                    , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
 2613                    , MOV format (OpReg eax) (OpReg dst_r)
 2614                    ]
 2615     return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
 2616         `appOL` code
 2617   where
 2618     format = intFormat width
 2619 
 2620 genCCall' config is32Bit (PrimTarget (MO_Xchg width)) [dst] [addr, value] _
 2621   | (is32Bit && width == W64) = panic "gencCall: 64bit atomic exchange not supported on 32bit platforms"
 2622   | otherwise = do
 2623     let dst_r = getRegisterReg platform (CmmLocal dst)
 2624     Amode amode addr_code <- getSimpleAmode is32Bit addr
 2625     (newval, newval_code) <- getSomeReg value
 2626     -- Copy the value into the target register, perform the exchange.
 2627     let code     = toOL
 2628                    [ MOV format (OpReg newval) (OpReg dst_r)
 2629                     -- On X86 xchg implies a lock prefix if we use a memory argument.
 2630                     -- so this is atomic.
 2631                    , XCHG format (OpAddr amode) dst_r
 2632                    ]
 2633     return $ addr_code `appOL` newval_code `appOL` code
 2634   where
 2635     format = intFormat width
 2636     platform = ncgPlatform config
 2637 
 2638 genCCall' _ is32Bit target dest_regs args bid = do
 2639   platform <- ncgPlatform <$> getConfig
 2640   case (target, dest_regs) of
 2641     -- void return type prim op
 2642     (PrimTarget op, []) ->
 2643         outOfLineCmmOp bid op Nothing args
 2644     -- we only cope with a single result for foreign calls
 2645     (PrimTarget op, [r])  -> case op of
 2646           MO_F32_Fabs -> case args of
 2647             [x] -> sse2FabsCode W32 x
 2648             _ -> panic "genCCall: Wrong number of arguments for fabs"
 2649           MO_F64_Fabs -> case args of
 2650             [x] -> sse2FabsCode W64 x
 2651             _ -> panic "genCCall: Wrong number of arguments for fabs"
 2652 
 2653           MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
 2654           MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
 2655           _other_op -> outOfLineCmmOp bid op (Just r) args
 2656 
 2657        where
 2658         actuallyInlineSSE2Op = actuallyInlineFloatOp'
 2659 
 2660         actuallyInlineFloatOp'  instr format [x]
 2661               = do res <- trivialUFCode format (instr format) x
 2662                    any <- anyReg res
 2663                    return (any (getRegisterReg platform  (CmmLocal r)))
 2664 
 2665         actuallyInlineFloatOp' _ _ args
 2666               = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
 2667                       ++ show (length args) ++ ")"
 2668 
 2669         sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
 2670         sse2FabsCode w x = do
 2671           let fmt = floatFormat w
 2672           x_code <- getAnyReg x
 2673           let
 2674             const | FF32 <- fmt = CmmInt 0x7fffffff W32
 2675                   | otherwise   = CmmInt 0x7fffffffffffffff W64
 2676           Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
 2677           tmp <- getNewRegNat fmt
 2678           let
 2679             code dst = x_code dst `appOL` amode_code `appOL` toOL [
 2680                 MOV fmt (OpAddr amode) (OpReg tmp),
 2681                 AND fmt (OpReg tmp) (OpReg dst)
 2682                 ]
 2683 
 2684           return $ code (getRegisterReg platform (CmmLocal r))
 2685 
 2686     (PrimTarget (MO_S_QuotRem  width), _) -> divOp1 platform True  width dest_regs args
 2687     (PrimTarget (MO_U_QuotRem  width), _) -> divOp1 platform False width dest_regs args
 2688     (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
 2689     (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
 2690         case args of
 2691         [arg_x, arg_y] ->
 2692             do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
 2693                let format = intFormat width
 2694                lCode <- anyReg =<< trivialCode width (ADD_CC format)
 2695                                      (Just (ADD_CC format)) arg_x arg_y
 2696                let reg_l = getRegisterReg platform (CmmLocal res_l)
 2697                    reg_h = getRegisterReg platform (CmmLocal res_h)
 2698                    code = hCode reg_h `appOL`
 2699                           lCode reg_l `snocOL`
 2700                           ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
 2701                return code
 2702         _ -> panic "genCCall: Wrong number of arguments/results for add2"
 2703     (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
 2704         addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
 2705     (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
 2706         addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
 2707     (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
 2708         addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
 2709     (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
 2710         addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
 2711     (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
 2712         case args of
 2713         [arg_x, arg_y] ->
 2714             do (y_reg, y_code) <- getRegOrMem arg_y
 2715                x_code <- getAnyReg arg_x
 2716                let format = intFormat width
 2717                    reg_h = getRegisterReg platform (CmmLocal res_h)
 2718                    reg_l = getRegisterReg platform (CmmLocal res_l)
 2719                    code = y_code `appOL`
 2720                           x_code rax `appOL`
 2721                           toOL [MUL2 format y_reg,
 2722                                 MOV format (OpReg rdx) (OpReg reg_h),
 2723                                 MOV format (OpReg rax) (OpReg reg_l)]
 2724                return code
 2725         _ -> panic "genCCall: Wrong number of arguments/results for mul2"
 2726     (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) ->
 2727         case args of
 2728         [arg_x, arg_y] ->
 2729             do (y_reg, y_code) <- getRegOrMem arg_y
 2730                x_code <- getAnyReg arg_x
 2731                reg_tmp <- getNewRegNat II8
 2732                let format = intFormat width
 2733                    reg_h = getRegisterReg platform (CmmLocal res_h)
 2734                    reg_l = getRegisterReg platform (CmmLocal res_l)
 2735                    reg_c = getRegisterReg platform (CmmLocal res_c)
 2736                    code = y_code `appOL`
 2737                           x_code rax `appOL`
 2738                           toOL [ IMUL2 format y_reg
 2739                                , MOV format (OpReg rdx) (OpReg reg_h)
 2740                                , MOV format (OpReg rax) (OpReg reg_l)
 2741                                , SETCC CARRY (OpReg reg_tmp)
 2742                                , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
 2743                                ]
 2744                return code
 2745         _ -> panic "genCCall: Wrong number of arguments/results for imul2"
 2746 
 2747     _ -> do
 2748         (instrs0, args') <- evalArgs bid args
 2749         instrs1 <- if is32Bit
 2750           then genCCall32' target dest_regs args'
 2751           else genCCall64' target dest_regs args'
 2752         return (instrs0 `appOL` instrs1)
 2753 
 2754   where divOp1 platform signed width results [arg_x, arg_y]
 2755             = divOp platform signed width results Nothing arg_x arg_y
 2756         divOp1 _ _ _ _ _
 2757             = panic "genCCall: Wrong number of arguments for divOp1"
 2758         divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
 2759             = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
 2760         divOp2 _ _ _ _ _
 2761             = panic "genCCall: Wrong number of arguments for divOp2"
 2762 
 2763         -- See Note [DIV/IDIV for bytes]
 2764         divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
 2765             let widen | signed = MO_SS_Conv W8 W16
 2766                       | otherwise = MO_UU_Conv W8 W16
 2767                 arg_x_low_16 = CmmMachOp widen [arg_x_low]
 2768                 arg_y_16 = CmmMachOp widen [arg_y]
 2769                 m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
 2770             in divOp
 2771                   platform signed W16 [res_q, res_r]
 2772                   m_arg_x_high_16 arg_x_low_16 arg_y_16
 2773 
 2774         divOp platform signed width [res_q, res_r]
 2775               m_arg_x_high arg_x_low arg_y
 2776             = do let format = intFormat width
 2777                      reg_q = getRegisterReg platform (CmmLocal res_q)
 2778                      reg_r = getRegisterReg platform (CmmLocal res_r)
 2779                      widen | signed    = CLTD format
 2780                            | otherwise = XOR format (OpReg rdx) (OpReg rdx)
 2781                      instr | signed    = IDIV
 2782                            | otherwise = DIV
 2783                  (y_reg, y_code) <- getRegOrMem arg_y
 2784                  x_low_code <- getAnyReg arg_x_low
 2785                  x_high_code <- case m_arg_x_high of
 2786                                 Just arg_x_high ->
 2787                                     getAnyReg arg_x_high
 2788                                 Nothing ->
 2789                                     return $ const $ unitOL widen
 2790                  return $ y_code `appOL`
 2791                           x_low_code rax `appOL`
 2792                           x_high_code rdx `appOL`
 2793                           toOL [instr format y_reg,
 2794                                 MOV format (OpReg rax) (OpReg reg_q),
 2795                                 MOV format (OpReg rdx) (OpReg reg_r)]
 2796         divOp _ _ _ _ _ _ _
 2797             = panic "genCCall: Wrong number of results for divOp"
 2798 
 2799         addSubIntC platform instr mrevinstr cond width
 2800                    res_r res_c [arg_x, arg_y]
 2801             = do let format = intFormat width
 2802                  rCode <- anyReg =<< trivialCode width (instr format)
 2803                                        (mrevinstr format) arg_x arg_y
 2804                  reg_tmp <- getNewRegNat II8
 2805                  let reg_c = getRegisterReg platform  (CmmLocal res_c)
 2806                      reg_r = getRegisterReg platform  (CmmLocal res_r)
 2807                      code = rCode reg_r `snocOL`
 2808                             SETCC cond (OpReg reg_tmp) `snocOL`
 2809                             MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
 2810 
 2811                  return code
 2812         addSubIntC _ _ _ _ _ _ _ _
 2813             = panic "genCCall: Wrong number of arguments/results for addSubIntC"
 2814 
 2815 {-
 2816 Note [Evaluate C-call arguments before placing in destination registers]
 2817 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 2818 
 2819 When producing code for C calls we must take care when placing arguments
 2820 in their final registers. Specifically, we must ensure that temporary register
 2821 usage due to evaluation of one argument does not clobber a register in which we
 2822 already placed a previous argument (e.g. as the code generation logic for
 2823 MO_Shl can clobber %rcx due to x86 instruction limitations).
 2824 
 2825 This is precisely what happened in #18527. Consider this C--:
 2826 
 2827     (result::I64) = call "ccall" doSomething(_s2hp::I64, 2244, _s2hq::I64, _s2hw::I64 | (1 << _s2hz::I64));
 2828 
 2829 Here we are calling the C function `doSomething` with three arguments, the last
 2830 involving a non-trivial expression involving MO_Shl. In this case the NCG could
 2831 naively generate the following assembly (where $tmp denotes some temporary
 2832 register and $argN denotes the register for argument N, as dictated by the
 2833 platform's calling convention):
 2834 
 2835     mov _s2hp, $arg1   # place first argument
 2836     mov _s2hq, $arg2   # place second argument
 2837 
 2838     # Compute 1 << _s2hz
 2839     mov _s2hz, %rcx
 2840     shl %cl, $tmp
 2841 
 2842     # Compute (_s2hw | (1 << _s2hz))
 2843     mov _s2hw, $arg3
 2844     or $tmp, $arg3
 2845 
 2846     # Perform the call
 2847     call func
 2848 
 2849 This code is outright broken on Windows which assigns $arg1 to %rcx. This means
 2850 that the evaluation of the last argument clobbers the first argument.
 2851 
 2852 To avoid this we use a rather awful hack: when producing code for a C call with
 2853 at least one non-trivial argument, we first evaluate all of the arguments into
 2854 local registers before moving them into their final calling-convention-defined
 2855 homes.  This is performed by 'evalArgs'. Here we define "non-trivial" to be an
 2856 expression which might contain a MachOp since these are the only cases which
 2857 might clobber registers. Furthermore, we use a conservative approximation of
 2858 this condition (only looking at the top-level of CmmExprs) to avoid spending
 2859 too much effort trying to decide whether we want to take the fast path.
 2860 
 2861 Note that this hack *also* applies to calls to out-of-line PrimTargets (which
 2862 are lowered via a C call) since outOfLineCmmOp produces the call via
 2863 (stmtToInstrs (CmmUnsafeForeignCall ...)), which will ultimately end up
 2864 back in genCCall{32,64}.
 2865 -}
 2866 
 2867 -- | See Note [Evaluate C-call arguments before placing in destination registers]
 2868 evalArgs :: BlockId -> [CmmActual] -> NatM (InstrBlock, [CmmActual])
 2869 evalArgs bid actuals
 2870   | any mightContainMachOp actuals = do
 2871       regs_blks <- mapM evalArg actuals
 2872       return (concatOL $ map fst regs_blks, map snd regs_blks)
 2873   | otherwise = return (nilOL, actuals)
 2874   where
 2875     mightContainMachOp (CmmReg _)      = False
 2876     mightContainMachOp (CmmRegOff _ _) = False
 2877     mightContainMachOp (CmmLit _)      = False
 2878     mightContainMachOp _               = True
 2879 
 2880     evalArg :: CmmActual -> NatM (InstrBlock, CmmExpr)
 2881     evalArg actual = do
 2882         platform <- getPlatform
 2883         lreg <- newLocalReg $ cmmExprType platform actual
 2884         (instrs, bid1) <- stmtToInstrs bid $ CmmAssign (CmmLocal lreg) actual
 2885         -- The above assignment shouldn't change the current block
 2886         massert (isNothing bid1)
 2887         return (instrs, CmmReg $ CmmLocal lreg)
 2888 
 2889     newLocalReg :: CmmType -> NatM LocalReg
 2890     newLocalReg ty = LocalReg <$> getUniqueM <*> pure ty
 2891 
 2892 -- Note [DIV/IDIV for bytes]
 2893 --
 2894 -- IDIV reminder:
 2895 --   Size    Dividend   Divisor   Quotient    Remainder
 2896 --   byte    %ax         r/m8      %al          %ah
 2897 --   word    %dx:%ax     r/m16     %ax          %dx
 2898 --   dword   %edx:%eax   r/m32     %eax         %edx
 2899 --   qword   %rdx:%rax   r/m64     %rax         %rdx
 2900 --
 2901 -- We do a special case for the byte division because the current
 2902 -- codegen doesn't deal well with accessing %ah register (also,
 2903 -- accessing %ah in 64-bit mode is complicated because it cannot be an
 2904 -- operand of many instructions). So we just widen operands to 16 bits
 2905 -- and get the results from %al, %dl. This is not optimal, but a few
 2906 -- register moves are probably not a huge deal when doing division.
 2907 
 2908 genCCall32' :: ForeignTarget            -- function to call
 2909             -> [CmmFormal]        -- where to put the result
 2910             -> [CmmActual]        -- arguments (of mixed type)
 2911             -> NatM InstrBlock
 2912 genCCall32' target dest_regs args = do
 2913         config <- getConfig
 2914         let platform = ncgPlatform config
 2915             prom_args = map (maybePromoteCArg platform W32) args
 2916 
 2917             -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
 2918             arg_size_bytes :: CmmType -> Int
 2919             arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth platform))
 2920 
 2921             roundTo a x | x `mod` a == 0 = x
 2922                         | otherwise = x + a - (x `mod` a)
 2923 
 2924             push_arg :: CmmActual {-current argument-}
 2925                             -> NatM InstrBlock  -- code
 2926 
 2927             push_arg  arg -- we don't need the hints on x86
 2928               | isWord64 arg_ty = do
 2929                 ChildCode64 code r_lo <- iselExpr64 arg
 2930                 delta <- getDeltaNat
 2931                 setDeltaNat (delta - 8)
 2932                 let r_hi = getHiVRegFromLo r_lo
 2933                 return (       code `appOL`
 2934                                toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
 2935                                      PUSH II32 (OpReg r_lo), DELTA (delta - 8),
 2936                                      DELTA (delta-8)]
 2937                     )
 2938 
 2939               | isFloatType arg_ty = do
 2940                 (reg, code) <- getSomeReg arg
 2941                 delta <- getDeltaNat
 2942                 setDeltaNat (delta-size)
 2943                 return (code `appOL`
 2944                                 toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
 2945                                       DELTA (delta-size),
 2946                                       let addr = AddrBaseIndex (EABaseReg esp)
 2947                                                                 EAIndexNone
 2948                                                                 (ImmInt 0)
 2949                                           format = floatFormat (typeWidth arg_ty)
 2950                                       in
 2951 
 2952                                       -- assume SSE2
 2953                                        MOV format (OpReg reg) (OpAddr addr)
 2954 
 2955                                      ]
 2956                                )
 2957 
 2958               | otherwise = do
 2959                 -- Arguments can be smaller than 32-bit, but we still use @PUSH
 2960                 -- II32@ - the usual calling conventions expect integers to be
 2961                 -- 4-byte aligned.
 2962                 massert ((typeWidth arg_ty) <= W32)
 2963                 (operand, code) <- getOperand arg
 2964                 delta <- getDeltaNat
 2965                 setDeltaNat (delta-size)
 2966                 return (code `snocOL`
 2967                         PUSH II32 operand `snocOL`
 2968                         DELTA (delta-size))
 2969 
 2970               where
 2971                  arg_ty = cmmExprType platform arg
 2972                  size = arg_size_bytes arg_ty -- Byte size
 2973 
 2974         let
 2975             -- Align stack to 16n for calls, assuming a starting stack
 2976             -- alignment of 16n - word_size on procedure entry. Which we
 2977             -- maintiain. See Note [rts/StgCRun.c : Stack Alignment on X86]
 2978             sizes               = map (arg_size_bytes . cmmExprType platform) (reverse args)
 2979             raw_arg_size        = sum sizes + platformWordSizeInBytes platform
 2980             arg_pad_size        = (roundTo 16 $ raw_arg_size) - raw_arg_size
 2981             tot_arg_size        = raw_arg_size + arg_pad_size - platformWordSizeInBytes platform
 2982 
 2983 
 2984         delta0 <- getDeltaNat
 2985         setDeltaNat (delta0 - arg_pad_size)
 2986 
 2987         push_codes <- mapM push_arg (reverse prom_args)
 2988         delta <- getDeltaNat
 2989         massert (delta == delta0 - tot_arg_size)
 2990 
 2991         -- deal with static vs dynamic call targets
 2992         (callinsns,cconv) <-
 2993           case target of
 2994             ForeignTarget (CmmLit (CmmLabel lbl)) conv
 2995                -> -- ToDo: stdcall arg sizes
 2996                   return (unitOL (CALL (Left fn_imm) []), conv)
 2997                where fn_imm = ImmCLbl lbl
 2998             ForeignTarget expr conv
 2999                -> do { (dyn_r, dyn_c) <- getSomeReg expr
 3000                      ; massert (isWord32 (cmmExprType platform expr))
 3001                      ; return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
 3002             PrimTarget _
 3003                 -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
 3004                             ++ "probably because too many return values."
 3005 
 3006         let push_code
 3007                 | arg_pad_size /= 0
 3008                 = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
 3009                         DELTA (delta0 - arg_pad_size)]
 3010                   `appOL` concatOL push_codes
 3011                 | otherwise
 3012                 = concatOL push_codes
 3013 
 3014               -- Deallocate parameters after call for ccall;
 3015               -- but not for stdcall (callee does it)
 3016               --
 3017               -- We have to pop any stack padding we added
 3018               -- even if we are doing stdcall, though (#5052)
 3019             pop_size
 3020                | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
 3021                | otherwise = tot_arg_size
 3022 
 3023             call = callinsns `appOL`
 3024                    toOL (
 3025                       (if pop_size==0 then [] else
 3026                        [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
 3027                       ++
 3028                       [DELTA delta0]
 3029                    )
 3030         setDeltaNat delta0
 3031 
 3032         let
 3033             -- assign the results, if necessary
 3034             assign_code []     = nilOL
 3035             assign_code [dest]
 3036               | isFloatType ty =
 3037                   -- we assume SSE2
 3038                   let tmp_amode = AddrBaseIndex (EABaseReg esp)
 3039                                                        EAIndexNone
 3040                                                        (ImmInt 0)
 3041                       fmt = floatFormat w
 3042                          in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
 3043                                    DELTA (delta0 - b),
 3044                                    X87Store fmt  tmp_amode,
 3045                                    -- X87Store only supported for the CDECL ABI
 3046                                    -- NB: This code will need to be
 3047                                    -- revisited once GHC does more work around
 3048                                    -- SIGFPE f
 3049                                    MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
 3050                                    ADD II32 (OpImm (ImmInt b)) (OpReg esp),
 3051                                    DELTA delta0]
 3052               | isWord64 ty    = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
 3053                                         MOV II32 (OpReg edx) (OpReg r_dest_hi)]
 3054               | otherwise      = unitOL (MOV (intFormat w)
 3055                                              (OpReg eax)
 3056                                              (OpReg r_dest))
 3057               where
 3058                     ty = localRegType dest
 3059                     w  = typeWidth ty
 3060                     b  = widthInBytes w
 3061                     r_dest_hi = getHiVRegFromLo r_dest
 3062                     r_dest    = getRegisterReg platform (CmmLocal dest)
 3063             assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
 3064 
 3065         return (push_code `appOL`
 3066                 call `appOL`
 3067                 assign_code dest_regs)
 3068 
 3069 genCCall64' :: ForeignTarget      -- function to call
 3070             -> [CmmFormal]        -- where to put the result
 3071             -> [CmmActual]        -- arguments (of mixed type)
 3072             -> NatM InstrBlock
 3073 genCCall64' target dest_regs args = do
 3074     platform <- getPlatform
 3075     -- load up the register arguments
 3076     let prom_args = map (maybePromoteCArg platform W32) args
 3077 
 3078     let load_args :: [CmmExpr]
 3079                   -> [Reg]         -- int regs avail for args
 3080                   -> [Reg]         -- FP regs avail for args
 3081                   -> InstrBlock    -- code computing args
 3082                   -> InstrBlock    -- code assigning args to ABI regs
 3083                   -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
 3084         -- no more regs to use
 3085         load_args args [] [] code acode     =
 3086             return (args, [], [], code, acode)
 3087 
 3088         -- no more args to push
 3089         load_args [] aregs fregs code acode =
 3090             return ([], aregs, fregs, code, acode)
 3091 
 3092         load_args (arg : rest) aregs fregs code acode
 3093             | isFloatType arg_rep = case fregs of
 3094                  []     -> push_this_arg
 3095                  (r:rs) -> do
 3096                     (code',acode') <- reg_this_arg r
 3097                     load_args rest aregs rs code' acode'
 3098             | otherwise           = case aregs of
 3099                  []     -> push_this_arg
 3100                  (r:rs) -> do
 3101                     (code',acode') <- reg_this_arg r
 3102                     load_args rest rs fregs code' acode'
 3103             where
 3104 
 3105               -- put arg into the list of stack pushed args
 3106               push_this_arg = do
 3107                  (args',ars,frs,code',acode')
 3108                      <- load_args rest aregs fregs code acode
 3109                  return (arg:args', ars, frs, code', acode')
 3110 
 3111               -- pass the arg into the given register
 3112               reg_this_arg r
 3113                 -- "operand" args can be directly assigned into r
 3114                 | isOperand False arg = do
 3115                     arg_code <- getAnyReg arg
 3116                     return (code, (acode `appOL` arg_code r))
 3117                 -- The last non-operand arg can be directly assigned after its
 3118                 -- computation without going into a temporary register
 3119                 | all (isOperand False) rest = do
 3120                     arg_code   <- getAnyReg arg
 3121                     return (code `appOL` arg_code r,acode)
 3122 
 3123                 -- other args need to be computed beforehand to avoid clobbering
 3124                 -- previously assigned registers used to pass parameters (see
 3125                 -- #11792, #12614). They are assigned into temporary registers
 3126                 -- and get assigned to proper call ABI registers after they all
 3127                 -- have been computed.
 3128                 | otherwise     = do
 3129                     arg_code <- getAnyReg arg
 3130                     tmp      <- getNewRegNat arg_fmt
 3131                     let
 3132                       code'  = code `appOL` arg_code tmp
 3133                       acode' = acode `snocOL` reg2reg arg_fmt tmp r
 3134                     return (code',acode')
 3135 
 3136               arg_rep = cmmExprType platform arg
 3137               arg_fmt = cmmTypeFormat arg_rep
 3138 
 3139         load_args_win :: [CmmExpr]
 3140                       -> [Reg]        -- used int regs
 3141                       -> [Reg]        -- used FP regs
 3142                       -> [(Reg, Reg)] -- (int, FP) regs avail for args
 3143                       -> InstrBlock
 3144                       -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
 3145         load_args_win args usedInt usedFP [] code
 3146             = return (args, usedInt, usedFP, code, nilOL)
 3147             -- no more regs to use
 3148         load_args_win [] usedInt usedFP _ code
 3149             = return ([], usedInt, usedFP, code, nilOL)
 3150             -- no more args to push
 3151         load_args_win (arg : rest) usedInt usedFP
 3152                       ((ireg, freg) : regs) code
 3153             | isFloatType arg_rep = do
 3154                  arg_code <- getAnyReg arg
 3155                  load_args_win rest (ireg : usedInt) (freg : usedFP) regs
 3156                                (code `appOL`
 3157                                 arg_code freg `snocOL`
 3158                                 -- If we are calling a varargs function
 3159                                 -- then we need to define ireg as well
 3160                                 -- as freg
 3161                                 MOV II64 (OpReg freg) (OpReg ireg))
 3162             | otherwise = do
 3163                  arg_code <- getAnyReg arg
 3164                  load_args_win rest (ireg : usedInt) usedFP regs
 3165                                (code `appOL` arg_code ireg)
 3166             where
 3167               arg_rep = cmmExprType platform arg
 3168 
 3169         arg_size = 8 -- always, at the mo
 3170 
 3171         push_args [] code = return code
 3172         push_args (arg:rest) code
 3173            | isFloatType arg_rep = do
 3174              (arg_reg, arg_code) <- getSomeReg arg
 3175              delta <- getDeltaNat
 3176              setDeltaNat (delta-arg_size)
 3177              let code' = code `appOL` arg_code `appOL` toOL [
 3178                             SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
 3179                             DELTA (delta-arg_size),
 3180                             MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel platform 0))]
 3181              push_args rest code'
 3182 
 3183            | otherwise = do
 3184              -- Arguments can be smaller than 64-bit, but we still use @PUSH
 3185              -- II64@ - the usual calling conventions expect integers to be
 3186              -- 8-byte aligned.
 3187              massert (width <= W64)
 3188              (arg_op, arg_code) <- getOperand arg
 3189              delta <- getDeltaNat
 3190              setDeltaNat (delta-arg_size)
 3191              let code' = code `appOL` arg_code `appOL` toOL [
 3192                                     PUSH II64 arg_op,
 3193                                     DELTA (delta-arg_size)]
 3194              push_args rest code'
 3195             where
 3196               arg_rep = cmmExprType platform arg
 3197               width = typeWidth arg_rep
 3198 
 3199         leaveStackSpace n = do
 3200              delta <- getDeltaNat
 3201              setDeltaNat (delta - n * arg_size)
 3202              return $ toOL [
 3203                          SUB II64 (OpImm (ImmInt (n * platformWordSizeInBytes platform))) (OpReg rsp),
 3204                          DELTA (delta - n * arg_size)]
 3205 
 3206     (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
 3207          <-
 3208         if platformOS platform == OSMinGW32
 3209         then load_args_win prom_args [] [] (allArgRegs platform) nilOL
 3210         else do
 3211            (stack_args, aregs, fregs, load_args_code, assign_args_code)
 3212                <- load_args prom_args (allIntArgRegs platform)
 3213                                       (allFPArgRegs platform)
 3214                                       nilOL nilOL
 3215            let used_regs rs as = reverse (drop (length rs) (reverse as))
 3216                fregs_used      = used_regs fregs (allFPArgRegs platform)
 3217                aregs_used      = used_regs aregs (allIntArgRegs platform)
 3218            return (stack_args, aregs_used, fregs_used, load_args_code
 3219                                                       , assign_args_code)
 3220 
 3221     let
 3222         arg_regs_used = int_regs_used ++ fp_regs_used
 3223         arg_regs = [eax] ++ arg_regs_used
 3224                 -- for annotating the call instruction with
 3225         sse_regs = length fp_regs_used
 3226         arg_stack_slots = if platformOS platform == OSMinGW32
 3227                           then length stack_args + length (allArgRegs platform)
 3228                           else length stack_args
 3229         tot_arg_size = arg_size * arg_stack_slots
 3230 
 3231 
 3232     -- Align stack to 16n for calls, assuming a starting stack
 3233     -- alignment of 16n - word_size on procedure entry. Which we
 3234     -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
 3235     let word_size = platformWordSizeInBytes platform
 3236     (real_size, adjust_rsp) <-
 3237         if (tot_arg_size + word_size) `rem` 16 == 0
 3238             then return (tot_arg_size, nilOL)
 3239             else do -- we need to adjust...
 3240                 delta <- getDeltaNat
 3241                 setDeltaNat (delta - word_size)
 3242                 return (tot_arg_size + word_size, toOL [
 3243                                 SUB II64 (OpImm (ImmInt word_size)) (OpReg rsp),
 3244                                 DELTA (delta - word_size) ])
 3245 
 3246     -- push the stack args, right to left
 3247     push_code <- push_args (reverse stack_args) nilOL
 3248     -- On Win64, we also have to leave stack space for the arguments
 3249     -- that we are passing in registers
 3250     lss_code <- if platformOS platform == OSMinGW32
 3251                 then leaveStackSpace (length (allArgRegs platform))
 3252                 else return nilOL
 3253     delta <- getDeltaNat
 3254 
 3255     -- deal with static vs dynamic call targets
 3256     (callinsns,_cconv) <-
 3257       case target of
 3258         ForeignTarget (CmmLit (CmmLabel lbl)) conv
 3259            -> -- ToDo: stdcall arg sizes
 3260               return (unitOL (CALL (Left fn_imm) arg_regs), conv)
 3261            where fn_imm = ImmCLbl lbl
 3262         ForeignTarget expr conv
 3263            -> do (dyn_r, dyn_c) <- getSomeReg expr
 3264                  return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
 3265         PrimTarget _
 3266             -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
 3267                         ++ "probably because too many return values."
 3268 
 3269     let
 3270         -- The x86_64 ABI requires us to set %al to the number of SSE2
 3271         -- registers that contain arguments, if the called routine
 3272         -- is a varargs function.  We don't know whether it's a
 3273         -- varargs function or not, so we have to assume it is.
 3274         --
 3275         -- It's not safe to omit this assignment, even if the number
 3276         -- of SSE2 regs in use is zero.  If %al is larger than 8
 3277         -- on entry to a varargs function, seg faults ensue.
 3278         assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
 3279 
 3280     let call = callinsns `appOL`
 3281                toOL (
 3282                     -- Deallocate parameters after call for ccall;
 3283                     -- stdcall has callee do it, but is not supported on
 3284                     -- x86_64 target (see #3336)
 3285                   (if real_size==0 then [] else
 3286                    [ADD (intFormat (platformWordWidth platform)) (OpImm (ImmInt real_size)) (OpReg esp)])
 3287                   ++
 3288                   [DELTA (delta + real_size)]
 3289                )
 3290     setDeltaNat (delta + real_size)
 3291 
 3292     let
 3293         -- assign the results, if necessary
 3294         assign_code []     = nilOL
 3295         assign_code [dest] =
 3296           case typeWidth rep of
 3297                 W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
 3298                                                      (OpReg xmm0)
 3299                                                      (OpReg r_dest))
 3300                 W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
 3301                                                      (OpReg xmm0)
 3302                                                      (OpReg r_dest))
 3303                 _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
 3304           where
 3305                 rep = localRegType dest
 3306                 r_dest = getRegisterReg platform  (CmmLocal dest)
 3307         assign_code _many = panic "genCCall.assign_code many"
 3308 
 3309     return (adjust_rsp          `appOL`
 3310             push_code           `appOL`
 3311             load_args_code      `appOL`
 3312             assign_args_code    `appOL`
 3313             lss_code            `appOL`
 3314             assign_eax sse_regs `appOL`
 3315             call                `appOL`
 3316             assign_code dest_regs)
 3317 
 3318 
 3319 maybePromoteCArg :: Platform -> Width -> CmmExpr -> CmmExpr
 3320 maybePromoteCArg platform wto arg
 3321  | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
 3322  | otherwise   = arg
 3323  where
 3324    wfrom = cmmExprWidth platform arg
 3325 
 3326 outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
 3327                -> NatM InstrBlock
 3328 outOfLineCmmOp bid mop res args
 3329   = do
 3330       config <- getConfig
 3331       targetExpr <- cmmMakeDynamicReference config CallReference lbl
 3332       let target = ForeignTarget targetExpr
 3333                            (ForeignConvention CCallConv [] [] CmmMayReturn)
 3334 
 3335       -- We know foreign calls results in no new basic blocks, so we can ignore
 3336       -- the returned block id.
 3337       (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
 3338       return instrs
 3339   where
 3340         -- Assume we can call these functions directly, and that they're not in a dynamic library.
 3341         -- TODO: Why is this ok? Under linux this code will be in libm.so
 3342         --       Is it because they're really implemented as a primitive instruction by the assembler??  -- BL 2009/12/31
 3343         lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
 3344 
 3345         fn = case mop of
 3346               MO_F32_Sqrt  -> fsLit "sqrtf"
 3347               MO_F32_Fabs  -> fsLit "fabsf"
 3348               MO_F32_Sin   -> fsLit "sinf"
 3349               MO_F32_Cos   -> fsLit "cosf"
 3350               MO_F32_Tan   -> fsLit "tanf"
 3351               MO_F32_Exp   -> fsLit "expf"
 3352               MO_F32_ExpM1 -> fsLit "expm1f"
 3353               MO_F32_Log   -> fsLit "logf"
 3354               MO_F32_Log1P -> fsLit "log1pf"
 3355 
 3356               MO_F32_Asin  -> fsLit "asinf"
 3357               MO_F32_Acos  -> fsLit "acosf"
 3358               MO_F32_Atan  -> fsLit "atanf"
 3359 
 3360               MO_F32_Sinh  -> fsLit "sinhf"
 3361               MO_F32_Cosh  -> fsLit "coshf"
 3362               MO_F32_Tanh  -> fsLit "tanhf"
 3363               MO_F32_Pwr   -> fsLit "powf"
 3364 
 3365               MO_F32_Asinh -> fsLit "asinhf"
 3366               MO_F32_Acosh -> fsLit "acoshf"
 3367               MO_F32_Atanh -> fsLit "atanhf"
 3368 
 3369               MO_F64_Sqrt  -> fsLit "sqrt"
 3370               MO_F64_Fabs  -> fsLit "fabs"
 3371               MO_F64_Sin   -> fsLit "sin"
 3372               MO_F64_Cos   -> fsLit "cos"
 3373               MO_F64_Tan   -> fsLit "tan"
 3374               MO_F64_Exp   -> fsLit "exp"
 3375               MO_F64_ExpM1 -> fsLit "expm1"
 3376               MO_F64_Log   -> fsLit "log"
 3377               MO_F64_Log1P -> fsLit "log1p"
 3378 
 3379               MO_F64_Asin  -> fsLit "asin"
 3380               MO_F64_Acos  -> fsLit "acos"
 3381               MO_F64_Atan  -> fsLit "atan"
 3382 
 3383               MO_F64_Sinh  -> fsLit "sinh"
 3384               MO_F64_Cosh  -> fsLit "cosh"
 3385               MO_F64_Tanh  -> fsLit "tanh"
 3386               MO_F64_Pwr   -> fsLit "pow"
 3387 
 3388               MO_F64_Asinh  -> fsLit "asinh"
 3389               MO_F64_Acosh  -> fsLit "acosh"
 3390               MO_F64_Atanh  -> fsLit "atanh"
 3391 
 3392               MO_I64_ToI   -> fsLit "hs_int64ToInt"
 3393               MO_I64_FromI -> fsLit "hs_intToInt64"
 3394               MO_W64_ToW   -> fsLit "hs_word64ToWord"
 3395               MO_W64_FromW -> fsLit "hs_wordToWord64"
 3396               MO_x64_Neg   -> fsLit "hs_neg64"
 3397               MO_x64_Add   -> fsLit "hs_add64"
 3398               MO_x64_Sub   -> fsLit "hs_sub64"
 3399               MO_x64_Mul   -> fsLit "hs_mul64"
 3400               MO_I64_Quot  -> fsLit "hs_quotInt64"
 3401               MO_I64_Rem   -> fsLit "hs_remInt64"
 3402               MO_W64_Quot  -> fsLit "hs_quotWord64"
 3403               MO_W64_Rem   -> fsLit "hs_remWord64"
 3404               MO_x64_And   -> fsLit "hs_and64"
 3405               MO_x64_Or    -> fsLit "hs_or64"
 3406               MO_x64_Xor   -> fsLit "hs_xor64"
 3407               MO_x64_Not   -> fsLit "hs_not64"
 3408               MO_x64_Shl   -> fsLit "hs_uncheckedShiftL64"
 3409               MO_I64_Shr   -> fsLit "hs_uncheckedIShiftRA64"
 3410               MO_W64_Shr   -> fsLit "hs_uncheckedShiftRL64"
 3411               MO_x64_Eq    -> fsLit "hs_eq64"
 3412               MO_x64_Ne    -> fsLit "hs_ne64"
 3413               MO_I64_Ge    -> fsLit "hs_geInt64"
 3414               MO_I64_Gt    -> fsLit "hs_gtInt64"
 3415               MO_I64_Le    -> fsLit "hs_leInt64"
 3416               MO_I64_Lt    -> fsLit "hs_ltInt64"
 3417               MO_W64_Ge    -> fsLit "hs_geWord64"
 3418               MO_W64_Gt    -> fsLit "hs_gtWord64"
 3419               MO_W64_Le    -> fsLit "hs_leWord64"
 3420               MO_W64_Lt    -> fsLit "hs_ltWord64"
 3421 
 3422               MO_Memcpy _  -> fsLit "memcpy"
 3423               MO_Memset _  -> fsLit "memset"
 3424               MO_Memmove _ -> fsLit "memmove"
 3425               MO_Memcmp _  -> fsLit "memcmp"
 3426 
 3427               MO_SuspendThread -> fsLit "suspendThread"
 3428               MO_ResumeThread  -> fsLit "resumeThread"
 3429 
 3430               MO_PopCnt _  -> fsLit "popcnt"
 3431               MO_BSwap _   -> fsLit "bswap"
 3432               {- Here the C implementation is used as there is no x86
 3433               instruction to reverse a word's bit order.
 3434               -}
 3435               MO_BRev w    -> bRevLabel w
 3436               MO_Clz w     -> clzLabel w
 3437               MO_Ctz _     -> unsupported
 3438 
 3439               MO_Pdep w    -> pdepLabel w
 3440               MO_Pext w    -> pextLabel w
 3441 
 3442               MO_AtomicRMW _ _ -> unsupported
 3443               MO_AtomicRead _  -> unsupported
 3444               MO_AtomicWrite _ -> unsupported
 3445               MO_Cmpxchg w     -> cmpxchgLabel w -- for W64 on 32-bit
 3446                                                  -- TODO: implement
 3447                                                  -- cmpxchg8b instr
 3448               MO_Xchg _        -> should_be_inline
 3449 
 3450               MO_UF_Conv _ -> unsupported
 3451 
 3452               MO_S_Mul2    {}  -> unsupported
 3453               MO_S_QuotRem {}  -> unsupported
 3454               MO_U_QuotRem {}  -> unsupported
 3455               MO_U_QuotRem2 {} -> unsupported
 3456               MO_Add2 {}       -> unsupported
 3457               MO_AddIntC {}    -> unsupported
 3458               MO_SubIntC {}    -> unsupported
 3459               MO_AddWordC {}   -> unsupported
 3460               MO_SubWordC {}   -> unsupported
 3461               MO_U_Mul2 {}     -> unsupported
 3462               MO_ReadBarrier   -> unsupported
 3463               MO_WriteBarrier  -> unsupported
 3464               MO_Touch         -> unsupported
 3465               (MO_Prefetch_Data _ ) -> unsupported
 3466         unsupported = panic ("outOfLineCmmOp: " ++ show mop
 3467                           ++ " not supported here")
 3468         -- If we generate a call for the given primop
 3469         -- something went wrong.
 3470         should_be_inline = panic ("outOfLineCmmOp: " ++ show mop
 3471                           ++ " should be handled inline")
 3472 
 3473 
 3474 -- -----------------------------------------------------------------------------
 3475 -- Generating a table-branch
 3476 
 3477 genSwitch :: CmmExpr -> SwitchTargets -> NatM InstrBlock
 3478 
 3479 genSwitch expr targets = do
 3480   config <- getConfig
 3481   let platform = ncgPlatform config
 3482       -- We widen to a native-width register because we cannot use arbitry sizes
 3483       -- in x86 addressing modes.
 3484       exprWidened = CmmMachOp
 3485         (MO_UU_Conv (cmmExprWidth platform expr)
 3486                     (platformWordWidth platform))
 3487         [expr]
 3488       indexExpr = cmmOffset platform exprWidened offset
 3489   if ncgPIC config
 3490   then do
 3491         (reg,e_code) <- getNonClobberedReg indexExpr
 3492            -- getNonClobberedReg because it needs to survive across t_code
 3493         lbl <- getNewLabelNat
 3494         let is32bit = target32Bit platform
 3495             os = platformOS platform
 3496             -- Might want to use .rodata.<function we're in> instead, but as
 3497             -- long as it's something unique it'll work out since the
 3498             -- references to the jump table are in the appropriate section.
 3499             rosection = case os of
 3500               -- on Mac OS X/x86_64, put the jump table in the text section to
 3501               -- work around a limitation of the linker.
 3502               -- ld64 is unable to handle the relocations for
 3503               --     .quad L1 - L0
 3504               -- if L0 is not preceded by a non-anonymous label in its section.
 3505               OSDarwin | not is32bit -> Section Text lbl
 3506               _ -> Section ReadOnlyData lbl
 3507         dynRef <- cmmMakeDynamicReference config DataReference lbl
 3508         (tableReg,t_code) <- getSomeReg $ dynRef
 3509         let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
 3510                                        (EAIndex reg (platformWordSizeInBytes platform)) (ImmInt 0))
 3511 
 3512         offsetReg <- getNewRegNat (intFormat (platformWordWidth platform))
 3513         return $ if is32bit || os == OSDarwin
 3514                  then e_code `appOL` t_code `appOL` toOL [
 3515                                 ADD (intFormat (platformWordWidth platform)) op (OpReg tableReg),
 3516                                 JMP_TBL (OpReg tableReg) ids rosection lbl
 3517                        ]
 3518                  else -- HACK: On x86_64 binutils<2.17 is only able to generate
 3519                       -- PC32 relocations, hence we only get 32-bit offsets in
 3520                       -- the jump table. As these offsets are always negative
 3521                       -- we need to properly sign extend them to 64-bit. This
 3522                       -- hack should be removed in conjunction with the hack in
 3523                       -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
 3524                       e_code `appOL` t_code `appOL` toOL [
 3525                                MOVSxL II32 op (OpReg offsetReg),
 3526                                ADD (intFormat (platformWordWidth platform))
 3527                                    (OpReg offsetReg)
 3528                                    (OpReg tableReg),
 3529                                JMP_TBL (OpReg tableReg) ids rosection lbl
 3530                        ]
 3531   else do
 3532         (reg,e_code) <- getSomeReg indexExpr
 3533         lbl <- getNewLabelNat
 3534         let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (platformWordSizeInBytes platform)) (ImmCLbl lbl))
 3535             code = e_code `appOL` toOL [
 3536                     JMP_TBL op ids (Section ReadOnlyData lbl) lbl
 3537                  ]
 3538         return code
 3539   where
 3540     (offset, blockIds) = switchTargetsToTable targets
 3541     ids = map (fmap DestBlockId) blockIds
 3542 
 3543 generateJumpTableForInstr :: NCGConfig -> Instr -> Maybe (NatCmmDecl (Alignment, RawCmmStatics) Instr)
 3544 generateJumpTableForInstr config (JMP_TBL _ ids section lbl)
 3545     = let getBlockId (DestBlockId id) = id
 3546           getBlockId _ = panic "Non-Label target in Jump Table"
 3547           blockIds = map (fmap getBlockId) ids
 3548       in Just (createJumpTable config blockIds section lbl)
 3549 generateJumpTableForInstr _ _ = Nothing
 3550 
 3551 createJumpTable :: NCGConfig -> [Maybe BlockId] -> Section -> CLabel
 3552                 -> GenCmmDecl (Alignment, RawCmmStatics) h g
 3553 createJumpTable config ids section lbl
 3554     = let jumpTable
 3555             | ncgPIC config =
 3556                   let ww = ncgWordWidth config
 3557                       jumpTableEntryRel Nothing
 3558                           = CmmStaticLit (CmmInt 0 ww)
 3559                       jumpTableEntryRel (Just blockid)
 3560                           = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
 3561                           where blockLabel = blockLbl blockid
 3562                   in map jumpTableEntryRel ids
 3563             | otherwise = map (jumpTableEntry config) ids
 3564       in CmmData section (mkAlignment 1, CmmStaticsRaw lbl jumpTable)
 3565 
 3566 extractUnwindPoints :: [Instr] -> [UnwindPoint]
 3567 extractUnwindPoints instrs =
 3568     [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]
 3569 
 3570 -- -----------------------------------------------------------------------------
 3571 -- 'condIntReg' and 'condFltReg': condition codes into registers
 3572 
 3573 -- Turn those condition codes into integers now (when they appear on
 3574 -- the right hand side of an assignment).
 3575 --
 3576 -- (If applicable) Do not fill the delay slots here; you will confuse the
 3577 -- register allocator.
 3578 
 3579 condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
 3580 
 3581 condIntReg cond x y = do
 3582   CondCode _ cond cond_code <- condIntCode cond x y
 3583   tmp <- getNewRegNat II8
 3584   let
 3585         code dst = cond_code `appOL` toOL [
 3586                     SETCC cond (OpReg tmp),
 3587                     MOVZxL II8 (OpReg tmp) (OpReg dst)
 3588                   ]
 3589   return (Any II32 code)
 3590 
 3591 
 3592 -----------------------------------------------------------
 3593 ---          Note [SSE Parity Checks]                   ---
 3594 -----------------------------------------------------------
 3595 
 3596 -- We have to worry about unordered operands (eg. comparisons
 3597 -- against NaN).  If the operands are unordered, the comparison
 3598 -- sets the parity flag, carry flag and zero flag.
 3599 -- All comparisons are supposed to return false for unordered
 3600 -- operands except for !=, which returns true.
 3601 --
 3602 -- Optimisation: we don't have to test the parity flag if we
 3603 -- know the test has already excluded the unordered case: eg >
 3604 -- and >= test for a zero carry flag, which can only occur for
 3605 -- ordered operands.
 3606 --
 3607 -- By reversing comparisons we can avoid testing the parity
 3608 -- for < and <= as well. If any of the arguments is an NaN we
 3609 -- return false either way. If both arguments are valid then
 3610 -- x <= y  <->  y >= x  holds. So it's safe to swap these.
 3611 --
 3612 -- We invert the condition inside getRegister'and  getCondCode
 3613 -- which should cover all invertable cases.
 3614 -- All other functions translating FP comparisons to assembly
 3615 -- use these to two generate the comparison code.
 3616 --
 3617 -- As an example consider a simple check:
 3618 --
 3619 -- func :: Float -> Float -> Int
 3620 -- func x y = if x < y then 1 else 0
 3621 --
 3622 -- Which in Cmm gives the floating point comparison.
 3623 --
 3624 --  if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
 3625 --
 3626 -- We used to compile this to an assembly code block like this:
 3627 -- _c2gh:
 3628 --  ucomiss %xmm2,%xmm1
 3629 --  jp _c2gf
 3630 --  jb _c2gg
 3631 --  jmp _c2gf
 3632 --
 3633 -- Where we have to introduce an explicit
 3634 -- check for unordered results (using jmp parity):
 3635 --
 3636 -- We can avoid this by exchanging the arguments and inverting the direction
 3637 -- of the comparison. This results in the sequence of:
 3638 --
 3639 --  ucomiss %xmm1,%xmm2
 3640 --  ja _c2g2
 3641 --  jmp _c2g1
 3642 --
 3643 -- Removing the jump reduces the pressure on the branch predidiction system
 3644 -- and plays better with the uOP cache.
 3645 
 3646 condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
 3647 condFltReg is32Bit cond x y = condFltReg_sse2
 3648  where
 3649 
 3650 
 3651   condFltReg_sse2 = do
 3652     CondCode _ cond cond_code <- condFltCode cond x y
 3653     tmp1 <- getNewRegNat (archWordFormat is32Bit)
 3654     tmp2 <- getNewRegNat (archWordFormat is32Bit)
 3655     let -- See Note [SSE Parity Checks]
 3656         code dst =
 3657            cond_code `appOL`
 3658              (case cond of
 3659                 NE  -> or_unordered dst
 3660                 GU  -> plain_test   dst
 3661                 GEU -> plain_test   dst
 3662                 -- Use ASSERT so we don't break releases if these creep in.
 3663                 LTT -> assertPpr False (ppr "Should have been turned into >") $
 3664                        and_ordered  dst
 3665                 LE  -> assertPpr False (ppr "Should have been turned into >=") $
 3666                        and_ordered  dst
 3667                 _   -> and_ordered  dst)
 3668 
 3669         plain_test dst = toOL [
 3670                     SETCC cond (OpReg tmp1),
 3671                     MOVZxL II8 (OpReg tmp1) (OpReg dst)
 3672                  ]
 3673         or_unordered dst = toOL [
 3674                     SETCC cond (OpReg tmp1),
 3675                     SETCC PARITY (OpReg tmp2),
 3676                     OR II8 (OpReg tmp1) (OpReg tmp2),
 3677                     MOVZxL II8 (OpReg tmp2) (OpReg dst)
 3678                   ]
 3679         and_ordered dst = toOL [
 3680                     SETCC cond (OpReg tmp1),
 3681                     SETCC NOTPARITY (OpReg tmp2),
 3682                     AND II8 (OpReg tmp1) (OpReg tmp2),
 3683                     MOVZxL II8 (OpReg tmp2) (OpReg dst)
 3684                   ]
 3685     return (Any II32 code)
 3686 
 3687 
 3688 -- -----------------------------------------------------------------------------
 3689 -- 'trivial*Code': deal with trivial instructions
 3690 
 3691 -- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
 3692 -- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
 3693 -- Only look for constants on the right hand side, because that's
 3694 -- where the generic optimizer will have put them.
 3695 
 3696 -- Similarly, for unary instructions, we don't have to worry about
 3697 -- matching an StInt as the argument, because genericOpt will already
 3698 -- have handled the constant-folding.
 3699 
 3700 
 3701 {-
 3702 The Rules of the Game are:
 3703 
 3704 * You cannot assume anything about the destination register dst;
 3705   it may be anything, including a fixed reg.
 3706 
 3707 * You may compute an operand into a fixed reg, but you may not
 3708   subsequently change the contents of that fixed reg.  If you
 3709   want to do so, first copy the value either to a temporary
 3710   or into dst.  You are free to modify dst even if it happens
 3711   to be a fixed reg -- that's not your problem.
 3712 
 3713 * You cannot assume that a fixed reg will stay live over an
 3714   arbitrary computation.  The same applies to the dst reg.
 3715 
 3716 * Temporary regs obtained from getNewRegNat are distinct from
 3717   each other and from all other regs, and stay live over
 3718   arbitrary computations.
 3719 
 3720 --------------------
 3721 
 3722 SDM's version of The Rules:
 3723 
 3724 * If getRegister returns Any, that means it can generate correct
 3725   code which places the result in any register, period.  Even if that
 3726   register happens to be read during the computation.
 3727 
 3728   Corollary #1: this means that if you are generating code for an
 3729   operation with two arbitrary operands, you cannot assign the result
 3730   of the first operand into the destination register before computing
 3731   the second operand.  The second operand might require the old value
 3732   of the destination register.
 3733 
 3734   Corollary #2: A function might be able to generate more efficient
 3735   code if it knows the destination register is a new temporary (and
 3736   therefore not read by any of the sub-computations).
 3737 
 3738 * If getRegister returns Any, then the code it generates may modify only:
 3739         (a) fresh temporaries
 3740         (b) the destination register
 3741         (c) known registers (eg. %ecx is used by shifts)
 3742   In particular, it may *not* modify global registers, unless the global
 3743   register happens to be the destination register.
 3744 -}
 3745 
 3746 trivialCode :: Width -> (Operand -> Operand -> Instr)
 3747             -> Maybe (Operand -> Operand -> Instr)
 3748             -> CmmExpr -> CmmExpr -> NatM Register
 3749 trivialCode width instr m a b
 3750     = do is32Bit <- is32BitPlatform
 3751          trivialCode' is32Bit width instr m a b
 3752 
 3753 trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
 3754              -> Maybe (Operand -> Operand -> Instr)
 3755              -> CmmExpr -> CmmExpr -> NatM Register
 3756 trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
 3757   | is32BitLit is32Bit lit_a = do
 3758   b_code <- getAnyReg b
 3759   let
 3760        code dst
 3761          = b_code dst `snocOL`
 3762            revinstr (OpImm (litToImm lit_a)) (OpReg dst)
 3763   return (Any (intFormat width) code)
 3764 
 3765 trivialCode' _ width instr _ a b
 3766   = genTrivialCode (intFormat width) instr a b
 3767 
 3768 -- This is re-used for floating pt instructions too.
 3769 genTrivialCode :: Format -> (Operand -> Operand -> Instr)
 3770                -> CmmExpr -> CmmExpr -> NatM Register
 3771 genTrivialCode rep instr a b = do
 3772   (b_op, b_code) <- getNonClobberedOperand b
 3773   a_code <- getAnyReg a
 3774   tmp <- getNewRegNat rep
 3775   let
 3776      -- We want the value of b to stay alive across the computation of a.
 3777      -- But, we want to calculate a straight into the destination register,
 3778      -- because the instruction only has two operands (dst := dst `op` src).
 3779      -- The troublesome case is when the result of b is in the same register
 3780      -- as the destination reg.  In this case, we have to save b in a
 3781      -- new temporary across the computation of a.
 3782      code dst
 3783         | dst `regClashesWithOp` b_op =
 3784                 b_code `appOL`
 3785                 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
 3786                 a_code dst `snocOL`
 3787                 instr (OpReg tmp) (OpReg dst)
 3788         | otherwise =
 3789                 b_code `appOL`
 3790                 a_code dst `snocOL`
 3791                 instr b_op (OpReg dst)
 3792   return (Any rep code)
 3793 
 3794 regClashesWithOp :: Reg -> Operand -> Bool
 3795 reg `regClashesWithOp` OpReg reg2   = reg == reg2
 3796 reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
 3797 _   `regClashesWithOp` _            = False
 3798 
 3799 -----------
 3800 
 3801 trivialUCode :: Format -> (Operand -> Instr)
 3802              -> CmmExpr -> NatM Register
 3803 trivialUCode rep instr x = do
 3804   x_code <- getAnyReg x
 3805   let
 3806      code dst =
 3807         x_code dst `snocOL`
 3808         instr (OpReg dst)
 3809   return (Any rep code)
 3810 
 3811 -----------
 3812 
 3813 
 3814 trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
 3815                   -> CmmExpr -> CmmExpr -> NatM Register
 3816 trivialFCode_sse2 pk instr x y
 3817     = genTrivialCode format (instr format) x y
 3818     where format = floatFormat pk
 3819 
 3820 
 3821 trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
 3822 trivialUFCode format instr x = do
 3823   (x_reg, x_code) <- getSomeReg x
 3824   let
 3825      code dst =
 3826         x_code `snocOL`
 3827         instr x_reg dst
 3828   return (Any format code)
 3829 
 3830 
 3831 --------------------------------------------------------------------------------
 3832 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
 3833 coerceInt2FP from to x =  coerce_sse2
 3834  where
 3835 
 3836    coerce_sse2 = do
 3837      (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
 3838      let
 3839            opc  = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
 3840                              n -> panic $ "coerceInt2FP.sse: unhandled width ("
 3841                                          ++ show n ++ ")"
 3842            code dst = x_code `snocOL` opc (intFormat from) x_op dst
 3843      return (Any (floatFormat to) code)
 3844         -- works even if the destination rep is <II32
 3845 
 3846 --------------------------------------------------------------------------------
 3847 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
 3848 coerceFP2Int from to x =  coerceFP2Int_sse2
 3849  where
 3850    coerceFP2Int_sse2 = do
 3851      (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
 3852      let
 3853            opc  = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
 3854                                n -> panic $ "coerceFP2Init.sse: unhandled width ("
 3855                                            ++ show n ++ ")"
 3856            code dst = x_code `snocOL` opc (intFormat to) x_op dst
 3857      return (Any (intFormat to) code)
 3858          -- works even if the destination rep is <II32
 3859 
 3860 
 3861 --------------------------------------------------------------------------------
 3862 coerceFP2FP :: Width -> CmmExpr -> NatM Register
 3863 coerceFP2FP to x = do
 3864   (x_reg, x_code) <- getSomeReg x
 3865   let
 3866         opc  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
 3867                                      n -> panic $ "coerceFP2FP: unhandled width ("
 3868                                                  ++ show n ++ ")"
 3869         code dst = x_code `snocOL` opc x_reg dst
 3870   return (Any ( floatFormat to) code)
 3871 
 3872 --------------------------------------------------------------------------------
 3873 
 3874 sse2NegCode :: Width -> CmmExpr -> NatM Register
 3875 sse2NegCode w x = do
 3876   let fmt = floatFormat w
 3877   x_code <- getAnyReg x
 3878   -- This is how gcc does it, so it can't be that bad:
 3879   let
 3880     const = case fmt of
 3881       FF32 -> CmmInt 0x80000000 W32
 3882       FF64 -> CmmInt 0x8000000000000000 W64
 3883       x@II8  -> wrongFmt x
 3884       x@II16 -> wrongFmt x
 3885       x@II32 -> wrongFmt x
 3886       x@II64 -> wrongFmt x
 3887 
 3888       where
 3889         wrongFmt x = panic $ "sse2NegCode: " ++ show x
 3890   Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
 3891   tmp <- getNewRegNat fmt
 3892   let
 3893     code dst = x_code dst `appOL` amode_code `appOL` toOL [
 3894         MOV fmt (OpAddr amode) (OpReg tmp),
 3895         XOR fmt (OpReg tmp) (OpReg dst)
 3896         ]
 3897   --
 3898   return (Any fmt code)
 3899 
 3900 isVecExpr :: CmmExpr -> Bool
 3901 isVecExpr (CmmMachOp (MO_V_Insert {}) _)   = True
 3902 isVecExpr (CmmMachOp (MO_V_Extract {}) _)  = True
 3903 isVecExpr (CmmMachOp (MO_V_Add {}) _)      = True
 3904 isVecExpr (CmmMachOp (MO_V_Sub {}) _)      = True
 3905 isVecExpr (CmmMachOp (MO_V_Mul {}) _)      = True
 3906 isVecExpr (CmmMachOp (MO_VS_Quot {}) _)    = True
 3907 isVecExpr (CmmMachOp (MO_VS_Rem {}) _)     = True
 3908 isVecExpr (CmmMachOp (MO_VS_Neg {}) _)     = True
 3909 isVecExpr (CmmMachOp (MO_VF_Insert {}) _)  = True
 3910 isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
 3911 isVecExpr (CmmMachOp (MO_VF_Add {}) _)     = True
 3912 isVecExpr (CmmMachOp (MO_VF_Sub {}) _)     = True
 3913 isVecExpr (CmmMachOp (MO_VF_Mul {}) _)     = True
 3914 isVecExpr (CmmMachOp (MO_VF_Quot {}) _)    = True
 3915 isVecExpr (CmmMachOp (MO_VF_Neg {}) _)     = True
 3916 isVecExpr (CmmMachOp _ [e])                = isVecExpr e
 3917 isVecExpr _                                = False
 3918 
 3919 needLlvm :: NatM a
 3920 needLlvm =
 3921     sorry $ unlines ["The native code generator does not support vector"
 3922                     ,"instructions. Please use -fllvm."]
 3923 
 3924 -- | This works on the invariant that all jumps in the given blocks are required.
 3925 --   Starting from there we try to make a few more jumps redundant by reordering
 3926 --   them.
 3927 --   We depend on the information in the CFG to do so so without a given CFG
 3928 --   we do nothing.
 3929 invertCondBranches :: Maybe CFG  -- ^ CFG if present
 3930                    -> LabelMap a -- ^ Blocks with info tables
 3931                    -> [NatBasicBlock Instr] -- ^ List of basic blocks
 3932                    -> [NatBasicBlock Instr]
 3933 invertCondBranches Nothing _       bs = bs
 3934 invertCondBranches (Just cfg) keep bs =
 3935     invert bs
 3936   where
 3937     invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
 3938     invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
 3939       | --pprTrace "Block" (ppr lbl1) True,
 3940         (jmp1,jmp2) <- last2 ins
 3941       , JXX cond1 target1 <- jmp1
 3942       , target1 == lbl2
 3943       --, pprTrace "CutChance" (ppr b1) True
 3944       , JXX ALWAYS target2 <- jmp2
 3945       -- We have enough information to check if we can perform the inversion
 3946       -- TODO: We could also check for the last asm instruction which sets
 3947       -- status flags instead. Which I suspect is worse in terms of compiler
 3948       -- performance, but might be applicable to more cases
 3949       , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
 3950       , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
 3951       -- Both jumps come from the same cmm statement
 3952       , transitionSource edgeInfo1 == transitionSource edgeInfo2
 3953       , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1
 3954 
 3955       --Int comparisons are invertable
 3956       , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
 3957       , Just _ <- maybeIntComparison op
 3958       , Just invCond <- maybeInvertCond cond1
 3959 
 3960       --Swap the last two jumps, invert the conditional jumps condition.
 3961       = let jumps =
 3962               case () of
 3963                 -- We are free the eliminate the jmp. So we do so.
 3964                 _ | not (mapMember target1 keep)
 3965                     -> [JXX invCond target2]
 3966                 -- If the conditional target is unlikely we put the other
 3967                 -- target at the front.
 3968                   | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
 3969                     -> [JXX invCond target2, JXX ALWAYS target1]
 3970                 -- Keep things as-is otherwise
 3971                   | otherwise
 3972                     -> [jmp1, jmp2]
 3973         in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
 3974            (BasicBlock lbl1
 3975             (dropTail 2 ins ++ jumps))
 3976             : invert (b2:bs)
 3977     invert (b:bs) = b : invert bs
 3978     invert [] = []