never executed always true always false
    1 {-# language GADTs #-}
    2 {-# LANGUAGE TupleSections #-}
    3 {-# LANGUAGE BangPatterns #-}
    4 {-# LANGUAGE BinaryLiterals #-}
    5 {-# LANGUAGE OverloadedStrings #-}
    6 {-# LANGUAGE NumericUnderscores #-}
    7 module GHC.CmmToAsm.AArch64.CodeGen (
    8       cmmTopCodeGen
    9     , generateJumpTableForInstr
   10 )
   11 
   12 where
   13 
   14 -- NCG stuff:
   15 import GHC.Prelude hiding (EQ)
   16 
   17 import GHC.Platform.Regs
   18 import GHC.CmmToAsm.AArch64.Instr
   19 import GHC.CmmToAsm.AArch64.Regs
   20 import GHC.CmmToAsm.AArch64.Cond
   21 
   22 import GHC.CmmToAsm.CPrim
   23 import GHC.Cmm.DebugBlock
   24 import GHC.CmmToAsm.Monad
   25    ( NatM, getNewRegNat
   26    , getPicBaseMaybeNat, getPlatform, getConfig
   27    , getDebugBlock, getFileId
   28    )
   29 -- import GHC.CmmToAsm.Instr
   30 import GHC.CmmToAsm.PIC
   31 import GHC.CmmToAsm.Format
   32 import GHC.CmmToAsm.Config
   33 import GHC.CmmToAsm.Types
   34 import GHC.Platform.Reg
   35 import GHC.Platform
   36 
   37 -- Our intermediate code:
   38 import GHC.Cmm.BlockId
   39 import GHC.Cmm
   40 import GHC.Cmm.Utils
   41 import GHC.Cmm.Switch
   42 import GHC.Cmm.CLabel
   43 import GHC.Cmm.Dataflow.Block
   44 import GHC.Cmm.Dataflow.Graph
   45 import GHC.Types.Tickish ( GenTickish(..) )
   46 import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
   47 
   48 -- The rest:
   49 import GHC.Data.OrdList
   50 import GHC.Utils.Outputable
   51 
   52 import Control.Monad    ( mapAndUnzipM, when, foldM )
   53 import Data.Word
   54 import Data.Maybe
   55 import GHC.Float
   56 
   57 import GHC.Types.Basic
   58 import GHC.Types.ForeignCall
   59 import GHC.Data.FastString
   60 import GHC.Utils.Misc
   61 import GHC.Utils.Panic
   62 
   63 -- Note [General layout of an NCG]
   64 -- @cmmTopCodeGen@ will be our main entry point to code gen.  Here we'll get
   65 -- @RawCmmDecl@; see GHC.Cmm
   66 --
   67 --   RawCmmDecl = GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) CmmGraph
   68 --
   69 --   GenCmmDecl d h g = CmmProc h CLabel [GlobalReg] g
   70 --                    | CmmData Section d
   71 --
   72 -- As a result we want to transform this to a list of @NatCmmDecl@, which is
   73 -- defined @GHC.CmmToAsm.Instr@ as
   74 --
   75 --   type NatCmmDecl statics instr
   76 --        = GenCmmDecl statics (LabelMap RawCmmStatics) (ListGraph instr)
   77 --
   78 -- Thus well' turn
   79 --   GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) CmmGraph
   80 -- into
   81 --   [GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) (ListGraph Instr)]
   82 --
   83 -- where @CmmGraph@ is
   84 --
   85 --   type CmmGraph = GenCmmGraph CmmNode
   86 --   data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
   87 --   type CmmBlock = Block CmmNode C C
   88 --
   89 -- and @ListGraph Instr@ is
   90 --
   91 --   newtype ListGraph i = ListGraph [GenBasicBlock i]
   92 --   data GenBasicBlock i = BasicBlock BlockId [i]
   93 
   94 cmmTopCodeGen
   95     :: RawCmmDecl
   96     -> NatM [NatCmmDecl RawCmmStatics Instr]
   97 
   98 -- Thus we'll have to deal with either CmmProc ...
   99 cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do
  100   -- do
  101   --   traceM $ "-- -------------------------- cmmTopGen (CmmProc) -------------------------- --\n"
  102   --         ++ showSDocUnsafe (ppr cmm)
  103 
  104   let blocks = toBlockListEntryFirst graph
  105   (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
  106   picBaseMb <- getPicBaseMaybeNat
  107 
  108   let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
  109       tops = proc : concat statics
  110 
  111   case picBaseMb of
  112       Just _picBase -> panic "AArch64.cmmTopCodeGen: picBase not implemented"
  113       Nothing -> return tops
  114 
  115 -- ... or CmmData.
  116 cmmTopCodeGen _cmm@(CmmData sec dat) = do
  117   -- do
  118   --   traceM $ "-- -------------------------- cmmTopGen (CmmData) -------------------------- --\n"
  119   --         ++ showSDocUnsafe (ppr cmm)
  120   return [CmmData sec dat] -- no translation, we just use CmmStatic
  121 
  122 basicBlockCodeGen
  123         :: Block CmmNode C C
  124         -> NatM ( [NatBasicBlock Instr]
  125                 , [NatCmmDecl RawCmmStatics Instr])
  126 
  127 basicBlockCodeGen block = do
  128   config <- getConfig
  129   -- do
  130   --   traceM $ "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
  131   --         ++ showSDocUnsafe (ppr block)
  132   let (_, nodes, tail)  = blockSplit block
  133       id = entryLabel block
  134       stmts = blockToList nodes
  135 
  136       header_comment_instr = unitOL $ MULTILINE_COMMENT (
  137           text "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
  138           $+$ pdoc (ncgPlatform config) block
  139           )
  140   -- Generate location directive
  141   dbg <- getDebugBlock (entryLabel block)
  142   loc_instrs <- case dblSourceTick =<< dbg of
  143     Just (SourceNote span name)
  144       -> do fileId <- getFileId (srcSpanFile span)
  145             let line = srcSpanStartLine span; col = srcSpanStartCol span
  146             return $ unitOL $ LOCATION fileId line col name
  147     _ -> return nilOL
  148   (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
  149   (!tail_instrs,_) <- stmtToInstrs mid_bid tail
  150   let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs
  151   -- TODO: Then x86 backend run @verifyBasicBlock@ here and inserts
  152   --      unwinding info. See Ticket 19913
  153   -- code generation may introduce new basic block boundaries, which
  154   -- are indicated by the NEWBLOCK instruction.  We must split up the
  155   -- instruction stream into basic blocks again.  Also, we extract
  156   -- LDATAs here too.
  157   let
  158         (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs
  159 
  160         mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
  161           = ([], BasicBlock id instrs : blocks, statics)
  162         mkBlocks (LDATA sec dat) (instrs,blocks,statics)
  163           = (instrs, blocks, CmmData sec dat:statics)
  164         mkBlocks instr (instrs,blocks,statics)
  165           = (instr:instrs, blocks, statics)
  166   return (BasicBlock id top : other_blocks, statics)
  167 
  168 
  169 -- -----------------------------------------------------------------------------
  170 -- | Utilities
  171 ann :: SDoc -> Instr -> Instr
  172 ann doc instr {- | debugIsOn -} = ANN doc instr
  173 -- ann _ instr = instr
  174 {-# INLINE ann #-}
  175 
  176 -- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with
  177 -- -dppr-debug.  The idea is that we can trivially see how a cmm expression
  178 -- ended up producing the assmebly we see.  By having the verbatim AST printed
  179 -- we can simply check the patterns that were matched to arrive at the assmebly
  180 -- we generated.
  181 --
  182 -- pprExpr will hide a lot of noise of the underlying data structure and print
  183 -- the expression into something that can be easily read by a human. However
  184 -- going back to the exact CmmExpr representation can be labourous and adds
  185 -- indirections to find the matches that lead to the assembly.
  186 --
  187 -- An improvement oculd be to have
  188 --
  189 --    (pprExpr genericPlatform e) <> parens (text. show e)
  190 --
  191 -- to have the best of both worlds.
  192 --
  193 -- Note: debugIsOn is too restrictive, it only works for debug compilers.
  194 -- However, we do not only want to inspect this for debug compilers. Ideally
  195 -- we'd have a check for -dppr-debug here already, such that we don't even
  196 -- generate the ANN expressions. However, as they are lazy, they shouldn't be
  197 -- forced until we actually force them, and without -dppr-debug they should
  198 -- never end up being forced.
  199 annExpr :: CmmExpr -> Instr -> Instr
  200 annExpr e instr {- | debugIsOn -} = ANN (text . show $ e) instr
  201 -- annExpr e instr {- | debugIsOn -} = ANN (pprExpr genericPlatform e) instr
  202 -- annExpr _ instr = instr
  203 {-# INLINE annExpr #-}
  204 
  205 -- -----------------------------------------------------------------------------
  206 -- Generating a table-branch
  207 
  208 -- TODO jump tables would be a lot faster, but we'll use bare bones for now.
  209 -- this is usually done by sticking the jump table ids into an instruction
  210 -- and then have the @generateJumpTableForInstr@ callback produce the jump
  211 -- table as a static.
  212 --
  213 -- See Ticket 19912
  214 --
  215 -- data SwitchTargets =
  216 --    SwitchTargets
  217 --        Bool                       -- Signed values
  218 --        (Integer, Integer)         -- Range
  219 --        (Maybe Label)              -- Default value
  220 --        (M.Map Integer Label)      -- The branches
  221 --
  222 -- Non Jumptable plan:
  223 -- xE <- expr
  224 --
  225 genSwitch :: CmmExpr -> SwitchTargets -> NatM InstrBlock
  226 genSwitch expr targets = do -- pprPanic "genSwitch" (ppr expr)
  227   (reg, format, code) <- getSomeReg expr
  228   let w = formatToWidth format
  229   let mkbranch acc (key, bid) = do
  230         (keyReg, _format, code) <- getSomeReg (CmmLit (CmmInt key w))
  231         return $ code `appOL`
  232                  toOL [ CMP (OpReg w reg) (OpReg w keyReg)
  233                       , BCOND EQ (TBlock bid)
  234                       ] `appOL` acc
  235       def_code = case switchTargetsDefault targets of
  236         Just bid -> unitOL (B (TBlock bid))
  237         Nothing  -> nilOL
  238 
  239   switch_code <- foldM mkbranch nilOL (switchTargetsCases targets)
  240   return $ code `appOL` switch_code `appOL` def_code
  241 
  242 -- We don't do jump tables for now, see Ticket 19912
  243 generateJumpTableForInstr :: NCGConfig -> Instr
  244   -> Maybe (NatCmmDecl RawCmmStatics Instr)
  245 generateJumpTableForInstr _ _ = Nothing
  246 
  247 -- -----------------------------------------------------------------------------
  248 -- Top-level of the instruction selector
  249 
  250 -- See Note [Keeping track of the current block] for why
  251 -- we pass the BlockId.
  252 stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in.
  253               -> [CmmNode O O] -- ^ Cmm Statement
  254               -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction
  255 stmtsToInstrs bid stmts =
  256     go bid stmts nilOL
  257   where
  258     go bid  []        instrs = return (instrs,bid)
  259     go bid (s:stmts)  instrs = do
  260       (instrs',bid') <- stmtToInstrs bid s
  261       -- If the statement introduced a new block, we use that one
  262       let !newBid = fromMaybe bid bid'
  263       go newBid stmts (instrs `appOL` instrs')
  264 
  265 -- | `bid` refers to the current block and is used to update the CFG
  266 --   if new blocks are inserted in the control flow.
  267 -- See Note [Keeping track of the current block] for more details.
  268 stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
  269              -> CmmNode e x
  270              -> NatM (InstrBlock, Maybe BlockId)
  271              -- ^ Instructions, and bid of new block if successive
  272              -- statements are placed in a different basic block.
  273 stmtToInstrs bid stmt = do
  274   -- traceM $ "-- -------------------------- stmtToInstrs -------------------------- --\n"
  275   --     ++ showSDocUnsafe (ppr stmt)
  276   platform <- getPlatform
  277   case stmt of
  278     CmmUnsafeForeignCall target result_regs args
  279        -> genCCall target result_regs args bid
  280 
  281     _ -> (,Nothing) <$> case stmt of
  282       CmmComment s   -> return (unitOL (COMMENT (ftext s)))
  283       CmmTick {}     -> return nilOL
  284 
  285       CmmAssign reg src
  286         | isFloatType ty         -> assignReg_FltCode format reg src
  287         | otherwise              -> assignReg_IntCode format reg src
  288           where ty = cmmRegType platform reg
  289                 format = cmmTypeFormat ty
  290 
  291       CmmStore addr src
  292         | isFloatType ty         -> assignMem_FltCode format addr src
  293         | otherwise              -> assignMem_IntCode format addr src
  294           where ty = cmmExprType platform src
  295                 format = cmmTypeFormat ty
  296 
  297       CmmBranch id          -> genBranch id
  298 
  299       --We try to arrange blocks such that the likely branch is the fallthrough
  300       --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
  301       CmmCondBranch arg true false _prediction ->
  302           genCondBranch bid true false arg
  303 
  304       CmmSwitch arg ids -> genSwitch arg ids
  305 
  306       CmmCall { cml_target = arg } -> genJump arg
  307 
  308       CmmUnwind _regs -> return nilOL
  309 
  310       _ -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt)
  311 
  312 --------------------------------------------------------------------------------
  313 -- | 'InstrBlock's are the insn sequences generated by the insn selectors.
  314 --      They are really trees of insns to facilitate fast appending, where a
  315 --      left-to-right traversal yields the insns in the correct order.
  316 --
  317 type InstrBlock
  318         = OrdList Instr
  319 
  320 -- | Register's passed up the tree.  If the stix code forces the register
  321 --      to live in a pre-decided machine register, it comes out as @Fixed@;
  322 --      otherwise, it comes out as @Any@, and the parent can decide which
  323 --      register to put it in.
  324 --
  325 data Register
  326         = Fixed Format Reg InstrBlock
  327         | Any   Format (Reg -> InstrBlock)
  328 
  329 -- | Sometimes we need to change the Format of a register. Primarily during
  330 -- conversion.
  331 swizzleRegisterRep :: Format -> Register -> Register
  332 swizzleRegisterRep format (Fixed _ reg code) = Fixed format reg code
  333 swizzleRegisterRep format (Any _ codefn)     = Any   format codefn
  334 
  335 -- | Grab the Reg for a CmmReg
  336 getRegisterReg :: Platform -> CmmReg -> Reg
  337 
  338 getRegisterReg _ (CmmLocal (LocalReg u pk))
  339   = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
  340 
  341 getRegisterReg platform (CmmGlobal mid)
  342   = case globalRegMaybe platform mid of
  343         Just reg -> RegReal reg
  344         Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
  345         -- By this stage, the only MagicIds remaining should be the
  346         -- ones which map to a real machine register on this
  347         -- platform.  Hence if it's not mapped to a registers something
  348         -- went wrong earlier in the pipeline.
  349 -- | Convert a BlockId to some CmmStatic data
  350 -- TODO: Add JumpTable Logic, see Ticket 19912
  351 -- jumpTableEntry :: NCGConfig -> Maybe BlockId -> CmmStatic
  352 -- jumpTableEntry config Nothing   = CmmStaticLit (CmmInt 0 (ncgWordWidth config))
  353 -- jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
  354 --     where blockLabel = blockLbl blockid
  355 
  356 -- -----------------------------------------------------------------------------
  357 -- General things for putting together code sequences
  358 
  359 -- | The dual to getAnyReg: compute an expression into a register, but
  360 --      we don't mind which one it is.
  361 getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock)
  362 getSomeReg expr = do
  363   r <- getRegister expr
  364   case r of
  365     Any rep code -> do
  366         tmp <- getNewRegNat rep
  367         return (tmp, rep, code tmp)
  368     Fixed rep reg code ->
  369         return (reg, rep, code)
  370 
  371 -- TODO OPT: we might be able give getRegister
  372 --          a hint, what kind of register we want.
  373 getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
  374 getFloatReg expr = do
  375   r <- getRegister expr
  376   case r of
  377     Any rep code | isFloatFormat rep -> do
  378       tmp <- getNewRegNat rep
  379       return (tmp, rep, code tmp)
  380     Any II32 code -> do
  381       tmp <- getNewRegNat FF32
  382       return (tmp, FF32, code tmp)
  383     Any II64 code -> do
  384       tmp <- getNewRegNat FF64
  385       return (tmp, FF64, code tmp)
  386     Any _w _code -> do
  387       config <- getConfig
  388       pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr)
  389     -- can't do much for fixed.
  390     Fixed rep reg code ->
  391       return (reg, rep, code)
  392 
  393 -- TODO: TODO, bounds. We can't put any immediate
  394 -- value in. They are constrained.
  395 -- See Ticket 19911
  396 litToImm' :: CmmLit -> NatM (Operand, InstrBlock)
  397 litToImm' lit = return (OpImm (litToImm lit), nilOL)
  398 
  399 
  400 getRegister :: CmmExpr -> NatM Register
  401 getRegister e = do
  402   config <- getConfig
  403   getRegister' config (ncgPlatform config) e
  404 
  405 -- Note [Handling PIC on AArch64]
  406 -- AArch64 does not have a special PIC register, the general approach is to
  407 -- simply go through the GOT, and there is assembly support for this:
  408 --
  409 --   // Load the address of 'sym' from the GOT using ADRP and LDR (used for
  410 --   // position-independent code on AArch64):
  411 --   adrp x0, #:got:sym
  412 --   ldr x0, [x0, #:got_lo12:sym]
  413 --
  414 -- See also: https://developer.arm.com/documentation/dui0774/i/armclang-integrated-assembler-directives/assembly-expressions
  415 --
  416 -- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the
  417 -- @cmmMakePicReference@.  This is in turn called from @cmmMakeDynamicReference@
  418 -- also in @Cmm.CmmToAsm.PIC@ from where it is also exported.  There are two
  419 -- callsites for this. One is in this module to produce the @target@ in @genCCall@
  420 -- the other is in @GHC.CmmToAsm@ in @cmmExprNative@.
  421 --
  422 -- Conceptually we do not want any special PicBaseReg to be used on AArch64. If
  423 -- we want to distinguish between symbol loading, we need to address this through
  424 -- the way we load it, not through a register.
  425 --
  426 
  427 getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register
  428 -- OPTIMIZATION WARNING: CmmExpr rewrites
  429 -- 1. Rewrite: Reg + (-n) => Reg - n
  430 --    TODO: this expression shouldn't even be generated to begin with.
  431 getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt i w1)]) | i < 0
  432   = getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt (-i) w1)])
  433 
  434 getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt i w1)]) | i < 0
  435   = getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt (-i) w1)])
  436 
  437 
  438 -- Generic case.
  439 getRegister' config plat expr
  440   = case expr of
  441     CmmReg (CmmGlobal PicBaseReg)
  442       -> pprPanic "getRegisterReg-memory" (ppr $ PicBaseReg)
  443     CmmLit lit
  444       -> case lit of
  445 
  446         -- TODO handle CmmInt 0 specially, use wzr or xzr.
  447 
  448         CmmInt i W8 | i >= 0 -> do
  449           return (Any (intFormat W8) (\dst -> unitOL $ annExpr expr (MOV (OpReg W8 dst) (OpImm (ImmInteger (narrowU W8 i))))))
  450         CmmInt i W16 | i >= 0 -> do
  451           return (Any (intFormat W16) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger (narrowU W16 i))))))
  452 
  453         CmmInt i W8  -> do
  454           return (Any (intFormat W8) (\dst -> unitOL $ annExpr expr (MOV (OpReg W8 dst) (OpImm (ImmInteger (narrowS W8 i))))))
  455         CmmInt i W16 -> do
  456           return (Any (intFormat W16) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger (narrowS W16 i))))))
  457 
  458         -- We need to be careful to not shorten this for negative literals.
  459         -- Those need the upper bits set. We'd either have to explicitly sign
  460         -- or figure out something smarter. Lowered to
  461         -- `MOV dst XZR`
  462         CmmInt i w | isNbitEncodeable 16 i, i >= 0 -> do
  463           return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger i)))))
  464         CmmInt i w | isNbitEncodeable 32 i, i >= 0 -> do
  465           let  half0 = fromIntegral (fromIntegral i :: Word16)
  466                half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
  467           return (Any (intFormat w) (\dst -> toOL [ annExpr expr
  468                                                   $ MOV (OpReg W32 dst) (OpImm (ImmInt half0))
  469                                                   , MOVK (OpReg W32 dst) (OpImmShift (ImmInt half1) SLSL 16)
  470                                                   ]))
  471         -- fallback for W32
  472         CmmInt i W32 -> do
  473           let  half0 = fromIntegral (fromIntegral i :: Word16)
  474                half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
  475           return (Any (intFormat W32) (\dst -> toOL [ annExpr expr
  476                                                     $ MOV (OpReg W32 dst) (OpImm (ImmInt half0))
  477                                                     , MOVK (OpReg W32 dst) (OpImmShift (ImmInt half1) SLSL 16)
  478                                                     ]))
  479         -- anything else
  480         CmmInt i W64 -> do
  481           let  half0 = fromIntegral (fromIntegral i :: Word16)
  482                half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
  483                half2 = fromIntegral (fromIntegral (i `shiftR` 32) :: Word16)
  484                half3 = fromIntegral (fromIntegral (i `shiftR` 48) :: Word16)
  485           return (Any (intFormat W64) (\dst -> toOL [ annExpr expr
  486                                                     $ MOV (OpReg W64 dst) (OpImm (ImmInt half0))
  487                                                     , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half1) SLSL 16)
  488                                                     , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half2) SLSL 32)
  489                                                     , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half3) SLSL 48)
  490                                                     ]))
  491         CmmInt _i rep -> do
  492           (op, imm_code) <- litToImm' lit
  493           return (Any (intFormat rep) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg rep dst) op)))
  494 
  495         -- floatToBytes (fromRational f)
  496         CmmFloat 0 w   -> do
  497           (op, imm_code) <- litToImm' lit
  498           return (Any (floatFormat w) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg w dst) op)))
  499 
  500         CmmFloat _f W8  -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr)
  501         CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr)
  502         CmmFloat f W32 -> do
  503           let word = castFloatToWord32 (fromRational f) :: Word32
  504               half0 = fromIntegral (fromIntegral word :: Word16)
  505               half1 = fromIntegral (fromIntegral (word `shiftR` 16) :: Word16)
  506           tmp <- getNewRegNat (intFormat W32)
  507           return (Any (floatFormat W32) (\dst -> toOL [ annExpr expr
  508                                                       $ MOV (OpReg W32 tmp) (OpImm (ImmInt half0))
  509                                                       , MOVK (OpReg W32 tmp) (OpImmShift (ImmInt half1) SLSL 16)
  510                                                       , MOV (OpReg W32 dst) (OpReg W32 tmp)
  511                                                       ]))
  512         CmmFloat f W64 -> do
  513           let word = castDoubleToWord64 (fromRational f) :: Word64
  514               half0 = fromIntegral (fromIntegral word :: Word16)
  515               half1 = fromIntegral (fromIntegral (word `shiftR` 16) :: Word16)
  516               half2 = fromIntegral (fromIntegral (word `shiftR` 32) :: Word16)
  517               half3 = fromIntegral (fromIntegral (word `shiftR` 48) :: Word16)
  518           tmp <- getNewRegNat (intFormat W64)
  519           return (Any (floatFormat W64) (\dst -> toOL [ annExpr expr
  520                                                       $ MOV (OpReg W64 tmp) (OpImm (ImmInt half0))
  521                                                       , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half1) SLSL 16)
  522                                                       , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half2) SLSL 32)
  523                                                       , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half3) SLSL 48)
  524                                                       , MOV (OpReg W64 dst) (OpReg W64 tmp)
  525                                                       ]))
  526         CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr)
  527         CmmVec _ -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr)
  528         CmmLabel _lbl -> do
  529           (op, imm_code) <- litToImm' lit
  530           let rep = cmmLitType plat lit
  531               format = cmmTypeFormat rep
  532           return (Any format (\dst -> imm_code `snocOL` (annExpr expr $ LDR format (OpReg (formatToWidth format) dst) op)))
  533 
  534         CmmLabelOff _lbl off | isNbitEncodeable 12 (fromIntegral off) -> do
  535           (op, imm_code) <- litToImm' lit
  536           let rep = cmmLitType plat lit
  537               format = cmmTypeFormat rep
  538               -- width = typeWidth rep
  539           return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op))
  540 
  541         CmmLabelOff lbl off -> do
  542           (op, imm_code) <- litToImm' (CmmLabel lbl)
  543           let rep = cmmLitType plat lit
  544               format = cmmTypeFormat rep
  545               width = typeWidth rep
  546           (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
  547           return (Any format (\dst -> imm_code `appOL` off_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op `snocOL` ADD (OpReg width dst) (OpReg width dst) (OpReg width off_r)))
  548 
  549         CmmLabelDiffOff _ _ _ _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
  550         CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
  551         CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
  552     CmmLoad mem rep -> do
  553       Amode addr addr_code <- getAmode plat mem
  554       let format = cmmTypeFormat rep
  555       return (Any format (\dst -> addr_code `snocOL` LDR format (OpReg (formatToWidth format) dst) (OpAddr addr)))
  556     CmmStackSlot _ _
  557       -> pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr)
  558     CmmReg reg
  559       -> return (Fixed (cmmTypeFormat (cmmRegType plat reg))
  560                        (getRegisterReg plat reg)
  561                        nilOL)
  562     CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do
  563       getRegister' config plat $
  564             CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
  565           where width = typeWidth (cmmRegType plat reg)
  566 
  567     CmmRegOff reg off -> do
  568       (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
  569       (reg, _format, code) <- getSomeReg $ CmmReg reg
  570       return $ Any (intFormat width) (\dst -> off_code `appOL` code `snocOL` ADD (OpReg width dst) (OpReg width reg) (OpReg width off_r))
  571           where width = typeWidth (cmmRegType plat reg)
  572 
  573 
  574 
  575     -- for MachOps, see GHC.Cmm.MachOp
  576     -- For CmmMachOp, see GHC.Cmm.Expr
  577     CmmMachOp op [e] -> do
  578       (reg, _format, code) <- getSomeReg e
  579       case op of
  580         MO_Not w -> return $ Any (intFormat w) (\dst -> code `snocOL` MVN (OpReg w dst) (OpReg w reg))
  581 
  582         MO_S_Neg w -> return $ Any (intFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
  583         MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
  584 
  585         MO_SF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
  586         MO_FS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)
  587 
  588         -- TODO this is very hacky
  589         -- Note, UBFM and SBFM expect source and target register to be of the same size, so we'll use @max from to@
  590         -- UBFM will set the high bits to 0. SBFM will copy the sign (sign extend).
  591         MO_UU_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` UBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
  592         MO_SS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` SBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
  593         MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))
  594 
  595         -- Conversions
  596         MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
  597 
  598         _ -> pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
  599       where toImm W8 =  (OpImm (ImmInt 7))
  600             toImm W16 = (OpImm (ImmInt 15))
  601             toImm W32 = (OpImm (ImmInt 31))
  602             toImm W64 = (OpImm (ImmInt 63))
  603             toImm W128 = (OpImm (ImmInt 127))
  604             toImm W256 = (OpImm (ImmInt 255))
  605             toImm W512 = (OpImm (ImmInt 511))
  606     -- Dyadic machops:
  607     --
  608     -- The general idea is:
  609     -- compute x<i> <- x
  610     -- compute x<j> <- y
  611     -- OP x<r>, x<i>, x<j>
  612     --
  613     -- TODO: for now we'll only implement the 64bit versions. And rely on the
  614     --      fallthrough to alert us if things go wrong!
  615     -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring
  616     -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg
  617     CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
  618     CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
  619     -- 1. Compute Reg +/- n directly.
  620     --    For Add/Sub we can directly encode 12bits, or 12bits lsl #12.
  621     CmmMachOp (MO_Add w) [(CmmReg reg), CmmLit (CmmInt n _)]
  622       | n > 0 && n < 4096 -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ADD (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
  623       -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
  624       where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
  625             r' = getRegisterReg plat reg
  626     CmmMachOp (MO_Sub w) [(CmmReg reg), CmmLit (CmmInt n _)]
  627       | n > 0 && n < 4096 -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (SUB (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
  628       -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
  629       where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
  630             r' = getRegisterReg plat reg
  631 
  632     CmmMachOp (MO_U_Quot w) [x, y] | w == W8 -> do
  633       (reg_x, _format_x, code_x) <- getSomeReg x
  634       (reg_y, _format_y, code_y) <- getSomeReg y
  635       return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTB (OpReg w reg_x) (OpReg w reg_x)) `snocOL` (UXTB (OpReg w reg_y) (OpReg w reg_y)) `snocOL` (UDIV (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)))
  636     CmmMachOp (MO_U_Quot w) [x, y] | w == W16 -> do
  637       (reg_x, _format_x, code_x) <- getSomeReg x
  638       (reg_y, _format_y, code_y) <- getSomeReg y
  639       return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTH (OpReg w reg_x) (OpReg w reg_x)) `snocOL` (UXTH (OpReg w reg_y) (OpReg w reg_y)) `snocOL` (UDIV (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)))
  640 
  641     -- 2. Shifts. x << n, x >> n.
  642     CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
  643       (reg_x, _format_x, code_x) <- getSomeReg x
  644       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
  645     CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
  646       (reg_x, _format_x, code_x) <- getSomeReg x
  647       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
  648 
  649     CmmMachOp (MO_S_Shr w) [x, (CmmLit (CmmInt n _))] | w == W8, 0 <= n, n < 8 -> do
  650       (reg_x, _format_x, code_x) <- getSomeReg x
  651       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (SBFX (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)) (OpImm (ImmInteger (8-n)))))
  652     CmmMachOp (MO_S_Shr w) [x, y] | w == W8 -> do
  653       (reg_x, _format_x, code_x) <- getSomeReg x
  654       (reg_y, _format_y, code_y) <- getSomeReg y
  655       return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (SXTB (OpReg w reg_x) (OpReg w reg_x)) `snocOL` (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)))
  656 
  657     CmmMachOp (MO_S_Shr w) [x, (CmmLit (CmmInt n _))] | w == W16, 0 <= n, n < 16 -> do
  658       (reg_x, _format_x, code_x) <- getSomeReg x
  659       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (SBFX (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)) (OpImm (ImmInteger (16-n)))))
  660     CmmMachOp (MO_S_Shr w) [x, y] | w == W16 -> do
  661       (reg_x, _format_x, code_x) <- getSomeReg x
  662       (reg_y, _format_y, code_y) <- getSomeReg y
  663       return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (SXTH (OpReg w reg_x) (OpReg w reg_x)) `snocOL` (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)))
  664 
  665     CmmMachOp (MO_S_Shr w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
  666       (reg_x, _format_x, code_x) <- getSomeReg x
  667       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (ASR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
  668 
  669     CmmMachOp (MO_S_Shr w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
  670       (reg_x, _format_x, code_x) <- getSomeReg x
  671       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (ASR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
  672 
  673 
  674     CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W8, 0 <= n, n < 8 -> do
  675       (reg_x, _format_x, code_x) <- getSomeReg x
  676       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (UBFX (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)) (OpImm (ImmInteger (8-n)))))
  677     CmmMachOp (MO_U_Shr w) [x, y] | w == W8 -> do
  678       (reg_x, _format_x, code_x) <- getSomeReg x
  679       (reg_y, _format_y, code_y) <- getSomeReg y
  680       return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTB (OpReg w reg_x) (OpReg w reg_x)) `snocOL` (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)))
  681 
  682     CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W16, 0 <= n, n < 16 -> do
  683       (reg_x, _format_x, code_x) <- getSomeReg x
  684       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (UBFX (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n)) (OpImm (ImmInteger (16-n)))))
  685     CmmMachOp (MO_U_Shr w) [x, y] | w == W16 -> do
  686       (reg_x, _format_x, code_x) <- getSomeReg x
  687       (reg_y, _format_y, code_y) <- getSomeReg y
  688       return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTH (OpReg w reg_x) (OpReg w reg_x)) `snocOL` (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)))
  689 
  690     CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
  691       (reg_x, _format_x, code_x) <- getSomeReg x
  692       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
  693 
  694     CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
  695       (reg_x, _format_x, code_x) <- getSomeReg x
  696       return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
  697 
  698     -- 3. Logic &&, ||
  699     CmmMachOp (MO_And w) [(CmmReg reg), CmmLit (CmmInt n _)] | isBitMaskImmediate (fromIntegral n) ->
  700       return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (AND (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
  701       where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
  702             r' = getRegisterReg plat reg
  703 
  704     CmmMachOp (MO_Or w) [(CmmReg reg), CmmLit (CmmInt n _)] | isBitMaskImmediate (fromIntegral n) ->
  705       return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ORR (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
  706       where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
  707             r' = getRegisterReg plat reg
  708 
  709     -- Generic case.
  710     CmmMachOp op [x, y] -> do
  711       -- alright, so we have an operation, and two expressions. And we want to essentially do
  712       -- ensure we get float regs
  713       let genOp w op = do
  714             (reg_x, format_x, code_x) <- getSomeReg x
  715             (reg_y, format_y, code_y) <- getSomeReg y
  716             when ((isFloatFormat format_x && isIntFormat format_y) || (isIntFormat format_x && isFloatFormat format_y)) $ pprPanic "getRegister:genOp" (text "formats don't match:" <+> text (show format_x) <+> text "/=" <+> text (show format_y))
  717             return $ Any format_x (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
  718 
  719           withTempIntReg w op = OpReg w <$> getNewRegNat (intFormat w) >>= op
  720           -- withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
  721 
  722           intOp w op = do
  723             -- compute x<m> <- x
  724             -- compute x<o> <- y
  725             -- <OP> x<n>, x<m>, x<o>
  726             (reg_x, _format_x, code_x) <- getSomeReg x
  727             (reg_y, _format_y, code_y) <- getSomeReg y
  728             return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
  729           floatOp w op = do
  730             (reg_fx, _format_x, code_fx) <- getFloatReg x
  731             (reg_fy, _format_y, code_fy) <- getFloatReg y
  732             return $ Any (floatFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))
  733           -- need a special one for conditionals, as they return ints
  734           floatCond w op = do
  735             (reg_fx, _format_x, code_fx) <- getFloatReg x
  736             (reg_fy, _format_y, code_fy) <- getFloatReg y
  737             return $ Any (intFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))
  738 
  739       case op of
  740         -- Integer operations
  741         -- Add/Sub should only be Interger Options.
  742         -- But our Cmm parser doesn't care about types
  743         -- and thus we end up with <float> + <float> => MO_Add <float> <float>
  744         MO_Add w -> genOp w (\d x y -> unitOL $ annExpr expr (ADD d x y))
  745         MO_Sub w -> genOp w (\d x y -> unitOL $ annExpr expr (SUB d x y))
  746 
  747         -- Note [CSET]
  748         --
  749         -- Setting conditional flags: the architecture internally knows the
  750         -- following flag bits.  And based on thsoe comparisons as in the
  751         -- table below.
  752         --
  753         --    31  30  29  28
  754         --  .---+---+---+---+-- - -
  755         --  | N | Z | C | V |
  756         --  '---+---+---+---+-- - -
  757         --  Negative
  758         --  Zero
  759         --  Carry
  760         --  oVerflow
  761         --
  762         --  .------+-------------------------------------+-----------------+----------.
  763         --  | Code | Meaning                             | Flags           | Encoding |
  764         --  |------+-------------------------------------+-----------------+----------|
  765         --  |  EQ  | Equal                               | Z = 1           | 0000     |
  766         --  |  NE  | Not Equal                           | Z = 0           | 0001     |
  767         --  |  HI  | Unsigned Higher                     | C = 1 && Z = 0  | 1000     |
  768         --  |  HS  | Unsigned Higher or Same             | C = 1           | 0010     |
  769         --  |  LS  | Unsigned Lower or Same              | C = 0 || Z = 1  | 1001     |
  770         --  |  LO  | Unsigned Lower                      | C = 0           | 0011     |
  771         --  |  GT  | Signed Greater Than                 | Z = 0 && N = V  | 1100     |
  772         --  |  GE  | Signed Greater Than or Equal        | N = V           | 1010     |
  773         --  |  LE  | Signed Less Than or Equal           | Z = 1 || N /= V | 1101     |
  774         --  |  LT  | Signed Less Than                    | N /= V          | 1011     |
  775         --  |  CS  | Carry Set (Unsigned Overflow)       | C = 1           | 0010     |
  776         --  |  CC  | Carry Clear (No Unsigned Overflow)  | C = 0           | 0011     |
  777         --  |  VS  | Signed Overflow                     | V = 1           | 0110     |
  778         --  |  VC  | No Signed Overflow                  | V = 0           | 0111     |
  779         --  |  MI  | Minus, Negative                     | N = 1           | 0100     |
  780         --  |  PL  | Plus, Positive or Zero (!)          | N = 0           | 0101     |
  781         --  |  AL  | Always                              | Any             | 1110     |
  782         --  |  NV  | Never                               | Any             | 1111     |
  783         --- '-------------------------------------------------------------------------'
  784         MO_Eq w@W8  -> intOp w (\d x y -> toOL [ SXTB x x, SXTB y y, CMP x y, CSET d EQ ])
  785         MO_Eq w@W16 -> intOp w (\d x y -> toOL [ SXTH x x, SXTH y y, CMP x y, CSET d EQ ])
  786         MO_Eq w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d EQ ])
  787         MO_Ne w@W8  -> intOp w (\d x y -> toOL [ SXTB x x, SXTB y y, CMP x y, CSET d NE ])
  788         MO_Ne w@W16 -> intOp w (\d x y -> toOL [ SXTH x x, SXTH y y, CMP x y, CSET d NE ])
  789         MO_Ne w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d NE ])
  790         MO_Mul w    -> intOp w (\d x y -> unitOL $ MUL d x y)
  791 
  792         -- Signed multiply/divide
  793         MO_S_MulMayOflo w -> intOp w (\d x y -> toOL [ MUL d x y, CSET d VS ])
  794         MO_S_Quot w -> intOp w (\d x y -> unitOL $ SDIV d x y)
  795 
  796         -- No native rem instruction. So we'll compute the following
  797         -- Rd  <- Rx / Ry             | 2 <- 7 / 3      -- SDIV Rd Rx Ry
  798         -- Rd' <- Rx - Rd * Ry        | 1 <- 7 - 2 * 3  -- MSUB Rd' Rd Ry Rx
  799         --        |     '---|----------------|---'   |
  800         --        |         '----------------|-------'
  801         --        '--------------------------'
  802         -- Note the swap in Rx and Ry.
  803         MO_S_Rem w -> withTempIntReg w $ \t ->
  804           intOp w (\d x y -> toOL [ SDIV t x y, MSUB d t y x ])
  805 
  806         -- Unsigned multiply/divide
  807         MO_U_MulMayOflo _w -> unsupportedP plat expr
  808         MO_U_Quot w -> intOp w (\d x y -> unitOL $ UDIV d x y)
  809         MO_U_Rem w  -> withTempIntReg w $ \t ->
  810           intOp w (\d x y -> toOL [ UDIV t x y, MSUB d t y x ])
  811 
  812         -- Signed comparisons -- see Note [CSET]
  813         MO_S_Ge w@W8  -> intOp w (\d x y -> toOL [ SXTB x x, SXTB y y, CMP x y, CSET d SGE ])
  814         MO_S_Ge w@W16 -> intOp w (\d x y -> toOL [ SXTH x x, SXTH y y, CMP x y, CSET d SGE ])
  815         MO_S_Ge w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d SGE ])
  816         MO_S_Le w@W8  -> intOp w (\d x y -> toOL [ SXTB x x, SXTB y y, CMP x y, CSET d SLE ])
  817         MO_S_Le w@W16 -> intOp w (\d x y -> toOL [ SXTH x x, SXTH y y, CMP x y, CSET d SLE ])
  818         MO_S_Le w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d SLE ])
  819         MO_S_Gt w@W8  -> intOp w (\d x y -> toOL [ SXTB x x, SXTB y y, CMP x y, CSET d SGT ])
  820         MO_S_Gt w@W16 -> intOp w (\d x y -> toOL [ SXTH x x, SXTH y y, CMP x y, CSET d SGT ])
  821         MO_S_Gt w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d SGT ])
  822         MO_S_Lt w@W8  -> intOp w (\d x y -> toOL [ SXTB x x, SXTB y y, CMP x y, CSET d SLT ])
  823         MO_S_Lt w@W16 -> intOp w (\d x y -> toOL [ SXTH x x, SXTH y y, CMP x y, CSET d SLT ])
  824         MO_S_Lt w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d SLT ])
  825 
  826         -- Unsigned comparisons
  827         MO_U_Ge w@W8  -> intOp w (\d x y -> toOL [ UXTB x x, UXTB y y, CMP x y, CSET d UGE ])
  828         MO_U_Ge w@W16 -> intOp w (\d x y -> toOL [ UXTH x x, UXTH y y, CMP x y, CSET d UGE ])
  829         MO_U_Ge w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d UGE ])
  830         MO_U_Le w@W8  -> intOp w (\d x y -> toOL [ UXTB x x, UXTB y y, CMP x y, CSET d ULE ])
  831         MO_U_Le w@W16 -> intOp w (\d x y -> toOL [ UXTH x x, UXTH y y, CMP x y, CSET d ULE ])
  832         MO_U_Le w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d ULE ])
  833         MO_U_Gt w@W8  -> intOp w (\d x y -> toOL [ UXTB x x, UXTB y y, CMP x y, CSET d UGT ])
  834         MO_U_Gt w@W16 -> intOp w (\d x y -> toOL [ UXTH x x, UXTH y y, CMP x y, CSET d UGT ])
  835         MO_U_Gt w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d UGT ])
  836         MO_U_Lt w@W8  -> intOp w (\d x y -> toOL [ UXTB x x, UXTB y y, CMP x y, CSET d ULT ])
  837         MO_U_Lt w@W16 -> intOp w (\d x y -> toOL [ UXTH x x, UXTH y y, CMP x y, CSET d ULT ])
  838         MO_U_Lt w     -> intOp w (\d x y -> toOL [                     CMP x y, CSET d ULT ])
  839 
  840         -- Floating point arithmetic
  841         MO_F_Add w   -> floatOp w (\d x y -> unitOL $ ADD d x y)
  842         MO_F_Sub w   -> floatOp w (\d x y -> unitOL $ SUB d x y)
  843         MO_F_Mul w   -> floatOp w (\d x y -> unitOL $ MUL d x y)
  844         MO_F_Quot w  -> floatOp w (\d x y -> unitOL $ SDIV d x y)
  845 
  846         -- Floating point comparison
  847         MO_F_Eq w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
  848         MO_F_Ne w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d NE ])
  849 
  850         -- careful with the floating point operations.
  851         -- SLE is effectively LE or unordered (NaN)
  852         -- SLT is the same. ULE, and ULT will not return true for NaN.
  853         -- This is a bit counter intutive. Don't let yourself be fooled by
  854         -- the S/U prefix for floats, it's only meaningful for integers.
  855         MO_F_Ge w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OGE ])
  856         MO_F_Le w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLE ]) -- x <= y <=> y > x
  857         MO_F_Gt w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OGT ])
  858         MO_F_Lt w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLT ]) -- x < y <=> y >= x
  859 
  860         -- Bitwise operations
  861         MO_And   w -> intOp w (\d x y -> unitOL $ AND d x y)
  862         MO_Or    w -> intOp w (\d x y -> unitOL $ ORR d x y)
  863         MO_Xor   w -> intOp w (\d x y -> unitOL $ EOR d x y)
  864         -- MO_Not   W64 ->
  865         MO_Shl   w -> intOp w (\d x y -> unitOL $ LSL d x y)
  866         MO_U_Shr w -> intOp w (\d x y -> unitOL $ LSR d x y)
  867         MO_S_Shr w -> intOp w (\d x y -> unitOL $ ASR d x y)
  868 
  869         -- TODO
  870 
  871         op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
  872     CmmMachOp _op _xs
  873       -> pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr)
  874 
  875   where
  876     unsupportedP :: OutputableP env a => env -> a -> b
  877     unsupportedP platform op = pprPanic "Unsupported op:" (pdoc platform op)
  878 
  879     isNbitEncodeable :: Int -> Integer -> Bool
  880     isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
  881     -- This needs to check if n can be encoded as a bitmask immediate:
  882     --
  883     -- See https://stackoverflow.com/questions/30904718/range-of-immediate-values-in-armv8-a64-assembly
  884     --
  885     isBitMaskImmediate :: Integer -> Bool
  886     isBitMaskImmediate i = i `elem` [0b0000_0001, 0b0000_0010, 0b0000_0100, 0b0000_1000, 0b0001_0000, 0b0010_0000, 0b0100_0000, 0b1000_0000
  887                                     ,0b0000_0011, 0b0000_0110, 0b0000_1100, 0b0001_1000, 0b0011_0000, 0b0110_0000, 0b1100_0000
  888                                     ,0b0000_0111, 0b0000_1110, 0b0001_1100, 0b0011_1000, 0b0111_0000, 0b1110_0000
  889                                     ,0b0000_1111, 0b0001_1110, 0b0011_1100, 0b0111_1000, 0b1111_0000
  890                                     ,0b0001_1111, 0b0011_1110, 0b0111_1100, 0b1111_1000
  891                                     ,0b0011_1111, 0b0111_1110, 0b1111_1100
  892                                     ,0b0111_1111, 0b1111_1110
  893                                     ,0b1111_1111]
  894 
  895 
  896 -- -----------------------------------------------------------------------------
  897 --  The 'Amode' type: Memory addressing modes passed up the tree.
  898 data Amode = Amode AddrMode InstrBlock
  899 
  900 getAmode :: Platform -> CmmExpr -> NatM Amode
  901 -- TODO: Specialize stuff we can destructure here.
  902 
  903 -- OPTIMIZATION WARNING: Addressing modes.
  904 -- Addressing options:
  905 -- LDUR/STUR: imm9: -256 - 255
  906 getAmode platform (CmmRegOff reg off) | -256 <= off, off <= 255
  907   = return $ Amode (AddrRegImm reg' off') nilOL
  908     where reg' = getRegisterReg platform reg
  909           off' = ImmInt off
  910 -- LDR/STR: imm12: if reg is 32bit: 0 -- 16380 in multiples of 4
  911 getAmode platform (CmmRegOff reg off)
  912   | typeWidth (cmmRegType platform reg) == W32, 0 <= off, off <= 16380, off `mod` 4 == 0
  913   = return $ Amode (AddrRegImm reg' off') nilOL
  914     where reg' = getRegisterReg platform reg
  915           off' = ImmInt off
  916 -- LDR/STR: imm12: if reg is 64bit: 0 -- 32760 in multiples of 8
  917 getAmode platform (CmmRegOff reg off)
  918   | typeWidth (cmmRegType platform reg) == W64, 0 <= off, off <= 32760, off `mod` 8 == 0
  919   = return $ Amode (AddrRegImm reg' off') nilOL
  920     where reg' = getRegisterReg platform reg
  921           off' = ImmInt off
  922 
  923 -- For Stores we often see something like this:
  924 -- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2)
  925 -- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ]
  926 -- for `n` in range.
  927 getAmode _platform (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
  928   | -256 <= off, off <= 255
  929   = do (reg, _format, code) <- getSomeReg expr
  930        return $ Amode (AddrRegImm reg (ImmInteger off)) code
  931 
  932 getAmode _platform (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
  933   | -256 <= -off, -off <= 255
  934   = do (reg, _format, code) <- getSomeReg expr
  935        return $ Amode (AddrRegImm reg (ImmInteger (-off))) code
  936 
  937 -- Generic case
  938 getAmode _platform expr
  939   = do (reg, _format, code) <- getSomeReg expr
  940        return $ Amode (AddrReg reg) code
  941 
  942 -- -----------------------------------------------------------------------------
  943 -- Generating assignments
  944 
  945 -- Assignments are really at the heart of the whole code generation
  946 -- business.  Almost all top-level nodes of any real importance are
  947 -- assignments, which correspond to loads, stores, or register
  948 -- transfers.  If we're really lucky, some of the register transfers
  949 -- will go away, because we can use the destination register to
  950 -- complete the code generation for the right hand side.  This only
  951 -- fails when the right hand side is forced into a fixed register
  952 -- (e.g. the result of a call).
  953 
  954 assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
  955 assignReg_IntCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
  956 
  957 assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
  958 assignReg_FltCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
  959 
  960 assignMem_IntCode rep addrE srcE
  961   = do
  962     (src_reg, _format, code) <- getSomeReg srcE
  963     platform <- getPlatform
  964     Amode addr addr_code <- getAmode platform addrE
  965     return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE)))
  966             `consOL` (code
  967             `appOL` addr_code
  968             `snocOL` STR rep (OpReg (formatToWidth rep) src_reg) (OpAddr addr))
  969 
  970 assignReg_IntCode _ reg src
  971   = do
  972     platform <- getPlatform
  973     let dst = getRegisterReg platform reg
  974     r <- getRegister src
  975     return $ case r of
  976       Any _ code              -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` code dst
  977       Fixed format freg fcode -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` (fcode `snocOL` MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg))
  978 
  979 -- Let's treat Floating point stuff
  980 -- as integer code for now. Opaque.
  981 assignMem_FltCode = assignMem_IntCode
  982 assignReg_FltCode = assignReg_IntCode
  983 
  984 -- -----------------------------------------------------------------------------
  985 -- Jumps
  986 genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
  987 genJump expr@(CmmLit (CmmLabel lbl))
  988   = return $ unitOL (annExpr expr (J (TLabel lbl)))
  989 
  990 genJump expr = do
  991     (target, _format, code) <- getSomeReg expr
  992     return (code `appOL` unitOL (annExpr expr (J (TReg target))))
  993 
  994 -- -----------------------------------------------------------------------------
  995 --  Unconditional branches
  996 genBranch :: BlockId -> NatM InstrBlock
  997 genBranch = return . toOL . mkJumpInstr
  998 
  999 -- -----------------------------------------------------------------------------
 1000 -- Conditional branches
 1001 genCondJump
 1002     :: BlockId
 1003     -> CmmExpr
 1004     -> NatM InstrBlock
 1005 genCondJump bid expr = do
 1006     case expr of
 1007       -- Optimized == 0 case.
 1008       CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)] -> do
 1009         (reg_x, _format_x, code_x) <- getSomeReg x
 1010         return $ code_x `snocOL` (annExpr expr (CBZ (OpReg w reg_x) (TBlock bid)))
 1011 
 1012       -- Optimized /= 0 case.
 1013       CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)] -> do
 1014         (reg_x, _format_x, code_x) <- getSomeReg x
 1015         return $ code_x `snocOL`  (annExpr expr (CBNZ (OpReg w reg_x) (TBlock bid)))
 1016 
 1017       -- Generic case.
 1018       CmmMachOp mop [x, y] -> do
 1019 
 1020         let ubcond w cmp = do
 1021                 -- compute both sides.
 1022                 (reg_x, _format_x, code_x) <- getSomeReg x
 1023                 (reg_y, _format_y, code_y) <- getSomeReg y
 1024                 let x' = OpReg w reg_x
 1025                     y' = OpReg w reg_y
 1026                 return $ case w of
 1027                   W8  -> code_x `appOL` code_y `appOL` toOL [ UXTB x' x', UXTB y' y', CMP x' y', (annExpr expr (BCOND cmp (TBlock bid))) ]
 1028                   W16 -> code_x `appOL` code_y `appOL` toOL [ UXTH x' x', UXTH y' y', CMP x' y', (annExpr expr (BCOND cmp (TBlock bid))) ]
 1029                   _   -> code_x `appOL` code_y `appOL` toOL [                         CMP x' y', (annExpr expr (BCOND cmp (TBlock bid))) ]
 1030 
 1031             sbcond w cmp = do
 1032                 -- compute both sides.
 1033                 (reg_x, _format_x, code_x) <- getSomeReg x
 1034                 (reg_y, _format_y, code_y) <- getSomeReg y
 1035                 let x' = OpReg w reg_x
 1036                     y' = OpReg w reg_y
 1037                 return $ case w of
 1038                   W8  -> code_x `appOL` code_y `appOL` toOL [ SXTB x' x', SXTB y' y', CMP x' y', (annExpr expr (BCOND cmp (TBlock bid))) ]
 1039                   W16 -> code_x `appOL` code_y `appOL` toOL [ SXTH x' x', SXTH y' y', CMP x' y', (annExpr expr (BCOND cmp (TBlock bid))) ]
 1040                   _   -> code_x `appOL` code_y `appOL` toOL [                         CMP x' y', (annExpr expr (BCOND cmp (TBlock bid))) ]
 1041 
 1042             fbcond w cmp = do
 1043               -- ensure we get float regs
 1044               (reg_fx, _format_fx, code_fx) <- getFloatReg x
 1045               (reg_fy, _format_fy, code_fy) <- getFloatReg y
 1046               return $ code_fx `appOL` code_fy `snocOL` CMP (OpReg w reg_fx) (OpReg w reg_fy) `snocOL` (annExpr expr (BCOND cmp (TBlock bid)))
 1047 
 1048         case mop of
 1049           MO_F_Eq w -> fbcond w EQ
 1050           MO_F_Ne w -> fbcond w NE
 1051 
 1052           MO_F_Gt w -> fbcond w OGT
 1053           MO_F_Ge w -> fbcond w OGE
 1054           MO_F_Lt w -> fbcond w OLT
 1055           MO_F_Le w -> fbcond w OLE
 1056 
 1057           MO_Eq w   -> sbcond w EQ
 1058           MO_Ne w   -> sbcond w NE
 1059 
 1060           MO_S_Gt w -> sbcond w SGT
 1061           MO_S_Ge w -> sbcond w SGE
 1062           MO_S_Lt w -> sbcond w SLT
 1063           MO_S_Le w -> sbcond w SLE
 1064           MO_U_Gt w -> ubcond w UGT
 1065           MO_U_Ge w -> ubcond w UGE
 1066           MO_U_Lt w -> ubcond w ULT
 1067           MO_U_Le w -> ubcond w ULE
 1068           _ -> pprPanic "AArch64.genCondJump:case mop: " (text $ show expr)
 1069       _ -> pprPanic "AArch64.genCondJump: " (text $ show expr)
 1070 
 1071 
 1072 genCondBranch
 1073     :: BlockId      -- the source of the jump
 1074     -> BlockId      -- the true branch target
 1075     -> BlockId      -- the false branch target
 1076     -> CmmExpr      -- the condition on which to branch
 1077     -> NatM InstrBlock -- Instructions
 1078 
 1079 genCondBranch _ true false expr = do
 1080   b1 <- genCondJump true expr
 1081   b2 <- genBranch false
 1082   return (b1 `appOL` b2)
 1083 
 1084 -- -----------------------------------------------------------------------------
 1085 --  Generating C calls
 1086 
 1087 -- Now the biggest nightmare---calls.  Most of the nastiness is buried in
 1088 -- @get_arg@, which moves the arguments to the correct registers/stack
 1089 -- locations.  Apart from that, the code is easy.
 1090 --
 1091 -- As per *convention*:
 1092 -- x0-x7:   (volatile) argument registers
 1093 -- x8:      (volatile) indirect result register / Linux syscall no
 1094 -- x9-x15:  (volatile) caller saved regs
 1095 -- x16,x17: (volatile) intra-procedure-call registers
 1096 -- x18:     (volatile) platform register. don't use for portability
 1097 -- x19-x28: (non-volatile) callee save regs
 1098 -- x29:     (non-volatile) frame pointer
 1099 -- x30:                    link register
 1100 -- x31:                    stack pointer / zero reg
 1101 --
 1102 -- Thus, this is what a c function will expect. Find the arguments in x0-x7,
 1103 -- anything above that on the stack.  We'll ignore c functions with more than
 1104 -- 8 arguments for now.  Sorry.
 1105 --
 1106 -- We need to make sure we preserve x9-x15, don't want to touch x16, x17.
 1107 
 1108 -- Note [PLT vs GOT relocations]
 1109 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 1110 -- When linking objects together, we may need to lookup foreign references. That
 1111 -- is symbolic references to functions or values in other objects. When
 1112 -- compiling the object, we can not know where those elements will end up in
 1113 -- memory (relative to the current location). Thus the use of symbols. There
 1114 -- are two types of items we are interested, code segments we want to jump to
 1115 -- and continue execution there (functions, ...), and data items we want to look
 1116 -- up (strings, numbers, ...). For functions we can use the fact that we can use
 1117 -- an intermediate jump without visibility to the programs execution.  If we
 1118 -- want to jump to a function that is simply too far away to reach for the B/BL
 1119 -- instruction, we can create a small piece of code that loads the full target
 1120 -- address and jumps to that on demand. Say f wants to call g, however g is out
 1121 -- of range for a direct jump, we can create a function h in range for f, that
 1122 -- will load the address of g, and jump there. The area where we construct h
 1123 -- is called the Procedure Linking Table (PLT), we have essentially replaced
 1124 -- f -> g with f -> h -> g.  This is fine for function calls.  However if we
 1125 -- want to lookup values, this trick doesn't work, so we need something else.
 1126 -- We will instead reserve a slot in memory, and have a symbol pointing to that
 1127 -- slot. Now what we essentially do is, we reference that slot, and expect that
 1128 -- slot to hold the final resting address of the data we are interested in.
 1129 -- Thus what that symbol really points to is the location of the final data.
 1130 -- The block of memory where we hold all those slots is the Global Offset Table
 1131 -- (GOT).  Instead of x <- $foo, we now do y <- $fooPtr, and x <- [$y].
 1132 --
 1133 -- For JUMP/CALLs we have 26bits (+/- 128MB), for conditional branches we only
 1134 -- have 19bits (+/- 1MB).  Symbol lookups are also within +/- 1MB, thus for most
 1135 -- of the LOAD/STOREs we'd want to use adrp, and add to compute a value within
 1136 -- 4GB of the PC, and load that.  For anything outside of that range, we'd have
 1137 -- to go through the GOT.
 1138 --
 1139 --  adrp x0, <symbol>
 1140 --  add x0, :lo:<symbol>
 1141 --
 1142 -- will compute the address of <symbol> int x0 if <symbol> is within 4GB of the
 1143 -- PC.
 1144 --
 1145 -- If we want to get the slot in the global offset table (GOT), we can do this:
 1146 --
 1147 --   adrp x0, #:got:<symbol>
 1148 --   ldr x0, [x0, #:got_lo12:<symbol>]
 1149 --
 1150 -- this will compute the address anywhere in the addressable 64bit space into
 1151 -- x0, by loading the address from the GOT slot.
 1152 --
 1153 -- To actually get the value of <symbol>, we'd need to ldr x0, x0 still, which
 1154 -- for the first case can be optimized to use ldr x0, [x0, #:lo12:<symbol>]
 1155 -- instaed of the add instruction.
 1156 --
 1157 -- As the memory model for AArch64 for PIC is considered to be +/- 4GB, we do
 1158 -- not need to go through the GOT, unless we want to address the full address
 1159 -- range within 64bit.
 1160 
 1161 genCCall
 1162     :: ForeignTarget      -- function to call
 1163     -> [CmmFormal]        -- where to put the result
 1164     -> [CmmActual]        -- arguments (of mixed type)
 1165     -> BlockId            -- The block we are in
 1166     -> NatM (InstrBlock, Maybe BlockId)
 1167 -- TODO: Specialize where we can.
 1168 -- Generic impl
 1169 genCCall target dest_regs arg_regs bid = do
 1170   -- we want to pass arg_regs into allArgRegs
 1171   -- pprTraceM "genCCall target" (ppr target)
 1172   -- pprTraceM "genCCall formal" (ppr dest_regs)
 1173   -- pprTraceM "genCCall actual" (ppr arg_regs)
 1174 
 1175   case target of
 1176     -- The target :: ForeignTarget call can either
 1177     -- be a foreign procedure with an address expr
 1178     -- and a calling convention.
 1179     ForeignTarget expr _cconv -> do
 1180       (call_target, call_target_code) <- case expr of
 1181         -- if this is a label, let's just directly to it.  This will produce the
 1182         -- correct CALL relocation for BL...
 1183         (CmmLit (CmmLabel lbl)) -> pure (TLabel lbl, nilOL)
 1184         -- ... if it's not a label--well--let's compute the expression into a
 1185         -- register and jump to that. See Note [PLT vs GOT relocations]
 1186         _ -> do (reg, _format, reg_code) <- getSomeReg expr
 1187                 pure (TReg reg, reg_code)
 1188       -- compute the code and register logic for all arg_regs.
 1189       -- this will give us the format information to match on.
 1190       arg_regs' <- mapM getSomeReg arg_regs
 1191 
 1192       -- Now this is stupid.  Our Cmm expressions doesn't carry the proper sizes
 1193       -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in
 1194       -- STG; this thenn breaks packing of stack arguments, if we need to pack
 1195       -- for the pcs, e.g. darwinpcs.  Option one would be to fix the Int type
 1196       -- in Cmm proper. Option two, which we choose here is to use extended Hint
 1197       -- information to contain the size information and use that when packing
 1198       -- arguments, spilled onto the stack.
 1199       let (_res_hints, arg_hints) = foreignTargetHints target
 1200           arg_regs'' = zipWith (\(r, f, c) h -> (r,f,h,c)) arg_regs' arg_hints
 1201 
 1202       platform <- getPlatform
 1203       let packStack = platformOS platform == OSDarwin
 1204 
 1205       (stackSpace', passRegs, passArgumentsCode) <- passArguments packStack allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL
 1206 
 1207       -- if we pack the stack, we may need to adjust to multiple of 8byte.
 1208       -- if we don't pack the stack, it will always be multiple of 8.
 1209       let stackSpace = if stackSpace' `mod` 8 /= 0
 1210                        then 8 * (stackSpace' `div` 8 + 1)
 1211                        else stackSpace'
 1212 
 1213       (returnRegs, readResultsCode)   <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL
 1214 
 1215       let moveStackDown 0 = toOL [ PUSH_STACK_FRAME
 1216                                  , DELTA (-16) ]
 1217           moveStackDown i | odd i = moveStackDown (i + 1)
 1218           moveStackDown i = toOL [ PUSH_STACK_FRAME
 1219                                  , SUB (OpReg W64 (regSingle 31)) (OpReg W64 (regSingle 31)) (OpImm (ImmInt (8 * i)))
 1220                                  , DELTA (-8 * i - 16) ]
 1221           moveStackUp 0 = toOL [ POP_STACK_FRAME
 1222                                , DELTA 0 ]
 1223           moveStackUp i | odd i = moveStackUp (i + 1)
 1224           moveStackUp i = toOL [ ADD (OpReg W64 (regSingle 31)) (OpReg W64 (regSingle 31)) (OpImm (ImmInt (8 * i)))
 1225                                , POP_STACK_FRAME
 1226                                , DELTA 0 ]
 1227 
 1228       let code =    call_target_code          -- compute the label (possibly into a register)
 1229             `appOL` moveStackDown (stackSpace `div` 8)
 1230             `appOL` passArgumentsCode         -- put the arguments into x0, ...
 1231             `appOL` (unitOL $ BL call_target passRegs returnRegs) -- branch and link.
 1232             `appOL` readResultsCode           -- parse the results into registers
 1233             `appOL` moveStackUp (stackSpace `div` 8)
 1234       return (code, Nothing)
 1235 
 1236     PrimTarget MO_F32_Fabs
 1237       | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
 1238         unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
 1239     PrimTarget MO_F64_Fabs
 1240       | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
 1241         unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
 1242 
 1243     -- or a possibly side-effecting machine operation
 1244     -- mop :: CallishMachOp (see GHC.Cmm.MachOp)
 1245     PrimTarget mop -> do
 1246       -- We'll need config to construct forien targets
 1247       case mop of
 1248         -- 64 bit float ops
 1249         MO_F64_Pwr   -> mkCCall "pow"
 1250 
 1251         MO_F64_Sin   -> mkCCall "sin"
 1252         MO_F64_Cos   -> mkCCall "cos"
 1253         MO_F64_Tan   -> mkCCall "tan"
 1254 
 1255         MO_F64_Sinh  -> mkCCall "sinh"
 1256         MO_F64_Cosh  -> mkCCall "cosh"
 1257         MO_F64_Tanh  -> mkCCall "tanh"
 1258 
 1259         MO_F64_Asin  -> mkCCall "asin"
 1260         MO_F64_Acos  -> mkCCall "acos"
 1261         MO_F64_Atan  -> mkCCall "atan"
 1262 
 1263         MO_F64_Asinh -> mkCCall "asinh"
 1264         MO_F64_Acosh -> mkCCall "acosh"
 1265         MO_F64_Atanh -> mkCCall "atanh"
 1266 
 1267         MO_F64_Log   -> mkCCall "log"
 1268         MO_F64_Log1P -> mkCCall "log1p"
 1269         MO_F64_Exp   -> mkCCall "exp"
 1270         MO_F64_ExpM1 -> mkCCall "expm1"
 1271         MO_F64_Fabs  -> mkCCall "fabs"
 1272         MO_F64_Sqrt  -> mkCCall "sqrt"
 1273 
 1274         -- 32 bit float ops
 1275         MO_F32_Pwr   -> mkCCall "powf"
 1276 
 1277         MO_F32_Sin   -> mkCCall "sinf"
 1278         MO_F32_Cos   -> mkCCall "cosf"
 1279         MO_F32_Tan   -> mkCCall "tanf"
 1280         MO_F32_Sinh  -> mkCCall "sinhf"
 1281         MO_F32_Cosh  -> mkCCall "coshf"
 1282         MO_F32_Tanh  -> mkCCall "tanhf"
 1283         MO_F32_Asin  -> mkCCall "asinf"
 1284         MO_F32_Acos  -> mkCCall "acosf"
 1285         MO_F32_Atan  -> mkCCall "atanf"
 1286         MO_F32_Asinh -> mkCCall "asinhf"
 1287         MO_F32_Acosh -> mkCCall "acoshf"
 1288         MO_F32_Atanh -> mkCCall "atanhf"
 1289         MO_F32_Log   -> mkCCall "logf"
 1290         MO_F32_Log1P -> mkCCall "log1pf"
 1291         MO_F32_Exp   -> mkCCall "expf"
 1292         MO_F32_ExpM1 -> mkCCall "expm1f"
 1293         MO_F32_Fabs  -> mkCCall "fabsf"
 1294         MO_F32_Sqrt  -> mkCCall "sqrtf"
 1295 
 1296         -- 64-bit primops
 1297         MO_I64_ToI   -> mkCCall "hs_int64ToInt"
 1298         MO_I64_FromI -> mkCCall "hs_intToInt64"
 1299         MO_W64_ToW   -> mkCCall "hs_word64ToWord"
 1300         MO_W64_FromW -> mkCCall "hs_wordToWord64"
 1301         MO_x64_Neg   -> mkCCall "hs_neg64"
 1302         MO_x64_Add   -> mkCCall "hs_add64"
 1303         MO_x64_Sub   -> mkCCall "hs_sub64"
 1304         MO_x64_Mul   -> mkCCall "hs_mul64"
 1305         MO_I64_Quot  -> mkCCall "hs_quotInt64"
 1306         MO_I64_Rem   -> mkCCall "hs_remInt64"
 1307         MO_W64_Quot  -> mkCCall "hs_quotWord64"
 1308         MO_W64_Rem   -> mkCCall "hs_remWord64"
 1309         MO_x64_And   -> mkCCall "hs_and64"
 1310         MO_x64_Or    -> mkCCall "hs_or64"
 1311         MO_x64_Xor   -> mkCCall "hs_xor64"
 1312         MO_x64_Not   -> mkCCall "hs_not64"
 1313         MO_x64_Shl   -> mkCCall "hs_uncheckedShiftL64"
 1314         MO_I64_Shr   -> mkCCall "hs_uncheckedIShiftRA64"
 1315         MO_W64_Shr   -> mkCCall "hs_uncheckedShiftRL64"
 1316         MO_x64_Eq    -> mkCCall "hs_eq64"
 1317         MO_x64_Ne    -> mkCCall "hs_ne64"
 1318         MO_I64_Ge    -> mkCCall "hs_geInt64"
 1319         MO_I64_Gt    -> mkCCall "hs_gtInt64"
 1320         MO_I64_Le    -> mkCCall "hs_leInt64"
 1321         MO_I64_Lt    -> mkCCall "hs_ltInt64"
 1322         MO_W64_Ge    -> mkCCall "hs_geWord64"
 1323         MO_W64_Gt    -> mkCCall "hs_gtWord64"
 1324         MO_W64_Le    -> mkCCall "hs_leWord64"
 1325         MO_W64_Lt    -> mkCCall "hs_ltWord64"
 1326 
 1327         -- Conversion
 1328         MO_UF_Conv w        -> mkCCall (word2FloatLabel w)
 1329 
 1330         -- Arithmatic
 1331         -- These are not supported on X86, so I doubt they are used much.
 1332         MO_S_Mul2     _w -> unsupported mop
 1333         MO_S_QuotRem  _w -> unsupported mop
 1334         MO_U_QuotRem  _w -> unsupported mop
 1335         MO_U_QuotRem2 _w -> unsupported mop
 1336         MO_Add2       _w -> unsupported mop
 1337         MO_AddWordC   _w -> unsupported mop
 1338         MO_SubWordC   _w -> unsupported mop
 1339         MO_AddIntC    _w -> unsupported mop
 1340         MO_SubIntC    _w -> unsupported mop
 1341         MO_U_Mul2     _w -> unsupported mop
 1342 
 1343         -- Memory Ordering
 1344         -- TODO DMBSY is probably *way* too much!
 1345         MO_ReadBarrier      ->  return (unitOL DMBSY, Nothing)
 1346         MO_WriteBarrier     ->  return (unitOL DMBSY, Nothing)
 1347         MO_Touch            ->  return (nilOL, Nothing) -- Keep variables live (when using interior pointers)
 1348         -- Prefetch
 1349         MO_Prefetch_Data _n -> return (nilOL, Nothing) -- Prefetch hint.
 1350 
 1351         -- Memory copy/set/move/cmp, with alignment for optimization
 1352 
 1353         -- TODO Optimize and use e.g. quad registers to move memory around instead
 1354         -- of offloading this to memcpy. For small memcpys we can utilize
 1355         -- the 128bit quad registers in NEON to move block of bytes around.
 1356         -- Might also make sense of small memsets? Use xzr? What's the function
 1357         -- call overhead?
 1358         MO_Memcpy  _align   -> mkCCall "memcpy"
 1359         MO_Memset  _align   -> mkCCall "memset"
 1360         MO_Memmove _align   -> mkCCall "memmove"
 1361         MO_Memcmp  _align   -> mkCCall "memcmp"
 1362 
 1363         MO_SuspendThread    -> mkCCall "suspendThread"
 1364         MO_ResumeThread     -> mkCCall "resumeThread"
 1365 
 1366         MO_PopCnt w         -> mkCCall (popCntLabel w)
 1367         MO_Pdep w           -> mkCCall (pdepLabel w)
 1368         MO_Pext w           -> mkCCall (pextLabel w)
 1369         MO_Clz w            -> mkCCall (clzLabel w)
 1370         MO_Ctz w            -> mkCCall (ctzLabel w)
 1371         MO_BSwap w          -> mkCCall (bSwapLabel w)
 1372         MO_BRev w           -> mkCCall (bRevLabel w)
 1373 
 1374         -- -- Atomic read-modify-write.
 1375         MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop)
 1376         MO_AtomicRead w     -> mkCCall (atomicReadLabel w)
 1377         MO_AtomicWrite w    -> mkCCall (atomicWriteLabel w)
 1378         MO_Cmpxchg w        -> mkCCall (cmpxchgLabel w)
 1379         -- -- Should be an AtomicRMW variant eventually.
 1380         -- -- Sequential consistent.
 1381         -- TODO: this should be implemented properly!
 1382         MO_Xchg w           -> mkCCall (xchgLabel w)
 1383 
 1384   where
 1385     unsupported :: Show a => a -> b
 1386     unsupported mop = panic ("outOfLineCmmOp: " ++ show mop
 1387                           ++ " not supported here")
 1388     mkCCall :: FastString -> NatM (InstrBlock, Maybe BlockId)
 1389     mkCCall name = do
 1390       config <- getConfig
 1391       target <- cmmMakeDynamicReference config CallReference $
 1392           mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction
 1393       let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
 1394       genCCall (ForeignTarget target cconv) dest_regs arg_regs bid
 1395 
 1396     -- TODO: Optimize using paired stores and loads (STP, LDP). It is
 1397     -- automomatically done by the allocator for us. However it's not optimal,
 1398     -- as we'd rather want to have control over
 1399     --     all spill/load registers, so we can optimize with instructions like
 1400     --       STP xA, xB, [sp, #-16]!
 1401     --     and
 1402     --       LDP xA, xB, sp, #16
 1403     --
 1404     passArguments :: Bool -> [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)
 1405     passArguments _packStack _ _ [] stackSpace accumRegs accumCode = return (stackSpace, accumRegs, accumCode)
 1406     -- passArguments _ _ [] accumCode stackSpace | isEven stackSpace = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * stackSpace))
 1407     -- passArguments _ _ [] accumCode stackSpace = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * (stackSpace + 1)))
 1408     -- passArguments [] fpRegs (arg0:arg1:args) stack accumCode = do
 1409     --   -- allocate this on the stack
 1410     --   (r0, format0, code_r0) <- getSomeReg arg0
 1411     --   (r1, format1, code_r1) <- getSomeReg arg1
 1412     --   let w0 = formatToWidth format0
 1413     --       w1 = formatToWidth format1
 1414     --       stackCode = unitOL $ STP (OpReg w0 r0) (OpReg w1 R1), (OpAddr (AddrRegImm x31 (ImmInt (stackSpace * 8)))
 1415     --   passArguments gpRegs (fpReg:fpRegs) args (stackCode `appOL` accumCode)
 1416 
 1417       -- float promotion.
 1418       -- According to
 1419       --  ISO/IEC 9899:2018
 1420       --  Information technology — Programming languages — C
 1421       --
 1422       -- e.g.
 1423       -- http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf
 1424       -- http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf
 1425       --
 1426       -- GHC would need to know the prototype.
 1427       --
 1428       -- > If the expression that denotes the called function has a type that does not include a
 1429       -- > prototype, the integer promotions are performed on each argument, and arguments that
 1430       -- > have type float are promoted to double.
 1431       --
 1432       -- As we have no way to get prototypes for C yet, we'll *not* promote this
 1433       -- which is in line with the x86_64 backend :(
 1434       --
 1435       -- See the encode_values.cmm test.
 1436       --
 1437       -- We would essentially need to insert an FCVT (OpReg W64 fpReg) (OpReg W32 fpReg)
 1438       -- if w == W32.  But *only* if we don't have a prototype m(
 1439       --
 1440       -- For AArch64 specificies see: https://developer.arm.com/docs/ihi0055/latest/procedure-call-standard-for-the-arm-64-bit-architecture
 1441       --
 1442     -- Still have GP regs, and we want to pass an GP argument.
 1443 
 1444     -- AArch64-Darwin: stack packing and alignment
 1445     --
 1446     -- According to the "Writing ARM64 Code for Apple Platforms" document form
 1447     -- Apple, specifically the section "Handle Data Types and Data Alignment Properly"
 1448     -- we need to not only pack, but also align arguments on the stack.
 1449     --
 1450     -- Data type   Size (in bytes)   Natural alignment (in bytes)
 1451     -- BOOL, bool  1                 1
 1452     -- char        1                 1
 1453     -- short       2                 2
 1454     -- int         4                 4
 1455     -- long        8                 8
 1456     -- long long   8                 8
 1457     -- pointer     8                 8
 1458     -- size_t      8                 8
 1459     -- NSInteger   8                 8
 1460     -- CFIndex     8                 8
 1461     -- fpos_t      8                 8
 1462     -- off_t       8                 8
 1463     --
 1464     -- We can see that types are aligned by their sizes so the easiest way to
 1465     -- guarantee alignment during packing seems to be to pad to a multiple of the
 1466     -- size we want to pack. Failure to get this right can result in pretty
 1467     -- subtle bugs, e.g. #20137.
 1468 
 1469     passArguments pack (gpReg:gpRegs) fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do
 1470       let w = formatToWidth format
 1471       passArguments pack gpRegs fpRegs args stackSpace (gpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ann (text "Pass gp argument: " <> ppr r) $ MOV (OpReg w gpReg) (OpReg w r)))
 1472 
 1473     -- Still have FP regs, and we want to pass an FP argument.
 1474     passArguments pack gpRegs (fpReg:fpRegs) ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do
 1475       let w = formatToWidth format
 1476       passArguments pack gpRegs fpRegs args stackSpace (fpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ann (text "Pass fp argument: " <> ppr r) $ MOV (OpReg w fpReg) (OpReg w r)))
 1477 
 1478     -- No mor regs left to pass. Must pass on stack.
 1479     passArguments pack [] [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode = do
 1480       let w = formatToWidth format
 1481           bytes = widthInBits w `div` 8
 1482           space = if pack then bytes else 8
 1483           stackSpace' | pack && stackSpace `mod` space /= 0 = stackSpace + space - (stackSpace `mod` space)
 1484                       | otherwise                           = stackSpace
 1485           stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace'))))
 1486       passArguments pack [] [] args (stackSpace'+space) accumRegs (stackCode `appOL` accumCode)
 1487 
 1488     -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then.
 1489     passArguments pack [] fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do
 1490       let w = formatToWidth format
 1491           bytes = widthInBits w `div` 8
 1492           space = if pack then bytes else 8
 1493           stackSpace' | pack && stackSpace `mod` space /= 0 = stackSpace + space - (stackSpace `mod` space)
 1494                       | otherwise                           = stackSpace
 1495           stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace'))))
 1496       passArguments pack [] fpRegs args (stackSpace'+space) accumRegs (stackCode `appOL` accumCode)
 1497 
 1498     -- Still have gpRegs left, but want to pass a FP argument. Must be passed on the stack then.
 1499     passArguments pack gpRegs [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do
 1500       let w = formatToWidth format
 1501           bytes = widthInBits w `div` 8
 1502           space = if pack then bytes else 8
 1503           stackSpace' | pack && stackSpace `mod` space /= 0 = stackSpace + space - (stackSpace `mod` space)
 1504                       | otherwise                           = stackSpace
 1505           stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace'))))
 1506       passArguments pack gpRegs [] args (stackSpace'+space) accumRegs (stackCode `appOL` accumCode)
 1507 
 1508     passArguments _ _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
 1509 
 1510     readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM ([Reg], InstrBlock)
 1511     readResults _ _ [] accumRegs accumCode = return (accumRegs, accumCode)
 1512     readResults [] _ _ _ _ = do
 1513       platform <- getPlatform
 1514       pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target)
 1515     readResults _ [] _ _ _ = do
 1516       platform <- getPlatform
 1517       pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target)
 1518     readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumRegs accumCode = do
 1519       -- gp/fp reg -> dst
 1520       platform <- getPlatform
 1521       let rep = cmmRegType platform (CmmLocal dst)
 1522           format = cmmTypeFormat rep
 1523           w   = cmmRegWidth platform (CmmLocal dst)
 1524           r_dst = getRegisterReg platform (CmmLocal dst)
 1525       if isFloatFormat format
 1526         then readResults (gpReg:gpRegs) fpRegs dsts (fpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
 1527         else readResults gpRegs (fpReg:fpRegs) dsts (gpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg))
 1528 
 1529     unaryFloatOp w op arg_reg dest_reg = do
 1530       platform <- getPlatform
 1531       (reg_fx, _format_x, code_fx) <- getFloatReg arg_reg
 1532       let dst = getRegisterReg platform (CmmLocal dest_reg)
 1533       let code = code_fx `appOL` op (OpReg w dst) (OpReg w reg_fx)
 1534       return (code, Nothing)