{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE NondecreasingIndentation #-}
{-# LANGUAGE TupleSections #-}

{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}

-----------------------------------------------------------------------------
--
-- Generating machine code (instruction selection)
--
-- (c) The University of Glasgow 1996-2004
--
-----------------------------------------------------------------------------

-- This is a big module, but, if you pay attention to
-- (a) the sectioning, and (b) the type signatures, the
-- structure should not be too overwhelming.

module GHC.CmmToAsm.X86.CodeGen (
        cmmTopCodeGen,
        generateJumpTableForInstr,
        extractUnwindPoints,
        invertCondBranches,
        InstrBlock
)

where

-- NCG stuff:
import GHC.Prelude

import GHC.CmmToAsm.X86.Instr
import GHC.CmmToAsm.X86.Cond
import GHC.CmmToAsm.X86.Regs
import GHC.CmmToAsm.X86.Ppr
import GHC.CmmToAsm.X86.RegInfo

import GHC.Platform.Regs
import GHC.CmmToAsm.CPrim
import GHC.CmmToAsm.Types
import GHC.Cmm.DebugBlock
   ( DebugBlock(..), UnwindPoint(..), UnwindTable
   , UnwindExpr(UwReg), toUnwindExpr
   )
import GHC.CmmToAsm.PIC
import GHC.CmmToAsm.Monad
   ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
   , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat
   , getPicBaseMaybeNat, getDebugBlock, getFileId
   , addImmediateSuccessorNat, updateCfgNat, getConfig, getPlatform
   , getCfgWeights
   )
import GHC.CmmToAsm.CFG
import GHC.CmmToAsm.Format
import GHC.CmmToAsm.Config
import GHC.Platform.Reg
import GHC.Platform

-- Our intermediate code:
import GHC.Types.Basic
import GHC.Cmm.BlockId
import GHC.Unit.Types ( primUnitId )
import GHC.Cmm.Utils
import GHC.Cmm.Switch
import GHC.Cmm
import GHC.Cmm.Dataflow.Block
import GHC.Cmm.Dataflow.Collections
import GHC.Cmm.Dataflow.Graph
import GHC.Cmm.Dataflow.Label
import GHC.Cmm.CLabel
import GHC.Types.Tickish ( GenTickish(..) )
import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )

-- The rest:
import GHC.Types.ForeignCall ( CCallConv(..) )
import GHC.Data.OrdList
import GHC.Utils.Outputable
import GHC.Utils.Constants (debugIsOn)
import GHC.Utils.Panic
import GHC.Utils.Panic.Plain
import GHC.Data.FastString
import GHC.Driver.Session
import GHC.Utils.Misc
import GHC.Types.Unique.Supply ( getUniqueM )

import Control.Monad
import Data.Foldable (fold)
import Data.Int
import Data.Maybe
import Data.Word

import qualified Data.Map as M
is32BitPlatform :: NatM Bool
is32BitPlatform = do
    platform <- getPlatform
    return $ target32Bit platform

sse2Enabled :: NatM Bool
sse2Enabled = do
  config <- getConfig
  return (ncgSseVersion config >= Just SSE2)

sse4_2Enabled :: NatM Bool
sse4_2Enabled = do
  config <- getConfig
  return (ncgSseVersion config >= Just SSE42)

cmmTopCodeGen
        :: RawCmmDecl
        -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]

cmmTopCodeGen (CmmProc info lab live graph) = do
  let blocks = toBlockListEntryFirst graph
  (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
  picBaseMb <- getPicBaseMaybeNat
  platform <- getPlatform
  let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
      tops = proc : concat statics
      os   = platformOS platform

  case picBaseMb of
      Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
      Nothing -> return tops

cmmTopCodeGen (CmmData sec dat) =
  return [CmmData sec (mkAlignment 1, dat)]  -- no translation, we just use CmmStatic

{- Note [Verifying basic blocks]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   We want to guarantee a few things about the results
   of instruction selection.

   Namely that each basic block consists of:
    * A (potentially empty) sequence of straight-line instructions
      followed by
    * A (potentially empty) sequence of jump-like instructions.

   We can verify this by going through the instructions and
   checking that no non-jumpish instruction appears after a
   jumpish instruction.

   There are gotchas however:
    * CALLs are, strictly speaking, control flow, but we do not
      care about them here. Hence we treat them as regular instructions.

      It's safe for them to appear inside a basic block
      as (ignoring side effects inside the call) they will result in
      straight-line code.

    * NEWBLOCK marks the start of a new basic block, so it can
      be followed by any instructions.
-}
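
{- As an illustration, a sketch of what the check accepts and rejects;
   the instructions and operands here are hypothetical:

       MOV ...          -- straight-line code
       JXX ...          -- first jumpish instruction
       JMP ...          -- fine: more control flow may follow
       MOV ...          -- rejected: non-jump after a jump

   whereas NEWBLOCK resets the check:

       JMP ...
       NEWBLOCK lbl     -- a new block begins
       MOV ...          -- fine again
-}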

-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally.
verifyBasicBlock :: Platform -> [Instr] -> ()
verifyBasicBlock platform instrs
  | debugIsOn = go False instrs
  | otherwise = ()
  where
    go _      []        = ()
    go atEnd (i:instr)
        = case i of
            -- Start a new basic block
            NEWBLOCK {} -> go False instr
            -- Calls are not viable block terminators
            CALL {} | atEnd     -> faultyBlockWith i
                    | not atEnd -> go atEnd instr
            -- All instructions ok, check if we reached the end and continue.
            _ | not atEnd -> go (isJumpishInstr i) instr
              -- Only jumps allowed at the end of basic blocks.
              | otherwise -> if isJumpishInstr i
                                then go True instr
                                else faultyBlockWith i
    faultyBlockWith i
        = pprPanic "Non control flow instructions after end of basic block."
                   (pprInstr platform i <+> text "in:" $$ vcat (map (pprInstr platform) instrs))

basicBlockCodeGen
        :: CmmBlock
        -> NatM ( [NatBasicBlock Instr]
                , [NatCmmDecl (Alignment, RawCmmStatics) Instr])

basicBlockCodeGen block = do
  let (_, nodes, tail) = blockSplit block
      id = entryLabel block
      stmts = blockToList nodes
  -- Generate location directive
  dbg <- getDebugBlock (entryLabel block)
  loc_instrs <- case dblSourceTick =<< dbg of
    Just (SourceNote span name)
      -> do fileId <- getFileId (srcSpanFile span)
            let line = srcSpanStartLine span; col = srcSpanStartCol span
            return $ unitOL $ LOCATION fileId line col name
    _ -> return nilOL
  (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
  (!tail_instrs,_) <- stmtToInstrs mid_bid tail
  let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
  platform <- getPlatform
  return $! verifyBasicBlock platform (fromOL instrs)
  instrs' <- fold <$> traverse addSpUnwindings instrs
  -- code generation may introduce new basic block boundaries, which
  -- are indicated by the NEWBLOCK instruction.  We must split up the
  -- instruction stream into basic blocks again.  Also, we extract
  -- LDATAs here too.
  let
        (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'

        mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
          = ([], BasicBlock id instrs : blocks, statics)
        mkBlocks (LDATA sec dat) (instrs,blocks,statics)
          = (instrs, blocks, CmmData sec dat:statics)
        mkBlocks instr (instrs,blocks,statics)
          = (instr:instrs, blocks, statics)
  return (BasicBlock id top : other_blocks, statics)

-- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
-- in the @sp@ register. See Note [What is this unwinding business?] in "GHC.Cmm.DebugBlock"
-- for details.
addSpUnwindings :: Instr -> NatM (OrdList Instr)
addSpUnwindings instr@(DELTA d) = do
    config <- getConfig
    if ncgDwarfUnwindings config
        then do lbl <- mkAsmTempLabel <$> getUniqueM
                let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
                return $ toOL [ instr, UNWIND lbl unwind ]
        else return (unitOL instr)
addSpUnwindings instr = return $ unitOL instr
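
-- For instance (a sketch, assuming DWARF unwindings are enabled): a stack
-- adjustment @DELTA (-8)@ is emitted followed by an @UNWIND@ whose table
-- maps MachSp to @UwReg MachSp 8@, i.e. the unwinder recovers the old Sp
-- as the current Sp plus 8.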

{- Note [Keeping track of the current block]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When generating instructions for Cmm we sometimes require
the current block for things like retry loops.

We also sometimes change the current block, if a MachOp
results in branching control flow.

Issues arise if we have two statements in the same block,
which both depend on the current block id *and* change the
basic block after them. This happens for atomic primops
in the X86 backend where we want to update the CFG data structure
when introducing new basic blocks.

For example in #17334 we got this Cmm code:

        c3Bf: // global
            (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18);
            (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0);
            _s3sT::I64 = _s3sV::I64;
            goto c3B1;

This resulted in two new basic blocks being inserted:

        c3Bf:
                movl $18,%vI_n3Bo
                movq 88(%vI_s3sQ),%rax
                jmp _n3Bp
        n3Bp:
                ...
                cmpxchgq %vI_n3Bq,88(%vI_s3sQ)
                jne _n3Bp
                ...
                jmp _n3Bs
        n3Bs:
                ...
                cmpxchgq %vI_n3Bt,88(%vI_s3sQ)
                jne _n3Bs
                ...
                jmp _c3B1
        ...

Based on the Cmm, we called stmtToInstrs and translated both atomic
operations under the assumption that they would be placed into their Cmm
basic block `c3Bf`. However, for the retry loop we introduce new labels,
so this is not the case for the second statement.
This resulted in a desync between the explicit control flow graph
we construct as a separate data type and the actual control flow graph in the code.

Instead we now return the new basic block if a statement causes a change
in the current block, and use that block for all following statements.

For this reason genCCall is also split into two parts.  One for calls which
*won't* change the basic blocks in which successive instructions will be
placed (since they only evaluate CmmExpr, which can only contain MachOps, which
cannot introduce basic blocks in their lowerings).  A different one for calls
which *are* known to change the basic block.

-}
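
-- A minimal sketch of the threading this gives us (the statements s1 and s2
-- and the block b0 are hypothetical; see stmtsToInstrs below for the real
-- loop):
--
--   (is1, mb1) <- stmtToInstrs b0 s1        -- s1 ends in a fresh block b1
--   (is2, _)   <- stmtToInstrs (fromMaybe b0 mb1) s2
--
-- so any CFG updates made while translating s2 are against b1, not the
-- stale block id b0.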

-- See Note [Keeping track of the current block] for why
-- we pass the BlockId.
stmtsToInstrs :: BlockId -- ^ Basic block these statements will start to be placed in.
              -> [CmmNode O O] -- ^ Cmm Statements
              -> NatM (InstrBlock, BlockId) -- ^ Resulting instructions and the block they end in.
stmtsToInstrs bid stmts =
    go bid stmts nilOL
  where
    go bid  []        instrs = return (instrs,bid)
    go bid (s:stmts)  instrs = do
      (instrs',bid') <- stmtToInstrs bid s
      -- If the statement introduced a new block, we use that one
      let !newBid = fromMaybe bid bid'
      go newBid stmts (instrs `appOL` instrs')

-- | `bid` refers to the current block and is used to update the CFG
--   if new blocks are inserted in the control flow.
-- See Note [Keeping track of the current block] for more details.
stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
             -> CmmNode e x
             -> NatM (InstrBlock, Maybe BlockId)
             -- ^ Instructions, and bid of new block if successive
             -- statements are placed in a different basic block.
stmtToInstrs bid stmt = do
  is32Bit <- is32BitPlatform
  platform <- getPlatform
  case stmt of
    CmmUnsafeForeignCall target result_regs args
       -> genCCall is32Bit target result_regs args bid

    _ -> (,Nothing) <$> case stmt of
      CmmComment s   -> return (unitOL (COMMENT $ ftext s))
      CmmTick {}     -> return nilOL

      CmmUnwind regs -> do
        let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
            to_unwind_entry (reg, expr) = M.singleton reg (fmap (toUnwindExpr platform) expr)
        case foldMap to_unwind_entry regs of
          tbl | M.null tbl -> return nilOL
              | otherwise  -> do
                  lbl <- mkAsmTempLabel <$> getUniqueM
                  return $ unitOL $ UNWIND lbl tbl

      CmmAssign reg src
        | isFloatType ty         -> assignReg_FltCode format reg src
        | is32Bit && isWord64 ty -> assignReg_I64Code        reg src
        | otherwise              -> assignReg_IntCode format reg src
          where ty = cmmRegType platform reg
                format = cmmTypeFormat ty

      CmmStore addr src
        | isFloatType ty         -> assignMem_FltCode format addr src
        | is32Bit && isWord64 ty -> assignMem_I64Code        addr src
        | otherwise              -> assignMem_IntCode format addr src
          where ty = cmmExprType platform src
                format = cmmTypeFormat ty

      CmmBranch id -> return $ genBranch id

      -- We try to arrange blocks such that the likely branch is the fallthrough
      -- in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
      CmmCondBranch arg true false _ -> genCondBranch bid true false arg
      CmmSwitch arg ids -> genSwitch arg ids
      CmmCall { cml_target = arg
              , cml_args_regs = gregs } -> genJump arg (jumpRegs platform gregs)
      _ ->
        panic "stmtToInstrs: statement should have been cps'd away"


jumpRegs :: Platform -> [GlobalReg] -> [Reg]
jumpRegs platform gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]

--------------------------------------------------------------------------------
-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
--   They are really trees of insns to facilitate fast appending, where a
--   left-to-right traversal yields the insns in the correct order.
--
type InstrBlock
        = OrdList Instr
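
-- A note on usage: throughout this module blocks are built with the OrdList
-- combinators, e.g. (a sketch; x_code and y_code stand for arbitrary blocks)
--
--   code = x_code `appOL` y_code `snocOL` final_instr
--
-- which appends two blocks and then adds one trailing instruction, each in
-- constant time.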


-- | Condition codes passed up the tree.
--
data CondCode
        = CondCode Bool Cond InstrBlock


-- | a.k.a "Register64"
--      Reg is the lower 32-bit temporary which contains the result.
--      Use getHiVRegFromLo to find the other VRegUnique.
--
--      Rules of this simplified insn selection game are therefore that
--      the returned Reg may be modified.
--
data ChildCode64
   = ChildCode64
        InstrBlock
        Reg


-- | Registers passed up the tree.  If the stix code forces the register
--      to live in a pre-decided machine register, it comes out as @Fixed@;
--      otherwise, it comes out as @Any@, and the parent can decide which
--      register to put it in.
--
data Register
        = Fixed Format Reg InstrBlock
        | Any   Format (Reg -> InstrBlock)


swizzleRegisterRep :: Register -> Format -> Register
swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
swizzleRegisterRep (Any _ codefn)     format = Any   format codefn


-- | Grab the Reg for a CmmReg
getRegisterReg :: Platform -> CmmReg -> Reg

getRegisterReg _ (CmmLocal (LocalReg u pk))
  = -- assuming SSE2: Int, Word, Float and Double can all be register allocated
    let fmt = cmmTypeFormat pk in
    RegVirtual (mkVirtualReg u fmt)

getRegisterReg platform (CmmGlobal mid)
  = case globalRegMaybe platform mid of
        Just reg -> RegReal $ reg
        Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
        -- By this stage, the only MagicIds remaining should be the
        -- ones which map to a real machine register on this
        -- platform.  Hence ...


-- | Memory addressing modes passed up the tree.
data Amode
        = Amode AddrMode InstrBlock

{-
Now, given a tree (the argument to a CmmLoad) that references memory,
produce a suitable addressing mode.

A Rule of the Game (tm) for Amodes: use of the addr bit must
immediately follow use of the code part, since the code part puts
values in registers which the addr then refers to.  So you can't put
anything in between, lest it overwrite some of those registers.  If
you need to do some other computation between the code part and use of
the addr bit, first store the effective address from the amode in a
temporary, then do the other computation, and then use the temporary:

    code
    LEA amode, tmp
    ... other computation ...
    ... (tmp) ...
-}


-- | Check whether an integer will fit in 32 bits.
--      A CmmInt is intended to be truncated to the appropriate
--      number of bits, so here we truncate it to Int64.  This is
--      important because e.g. -1 as a CmmInt might be either
--      -1 or 18446744073709551615.
--
is32BitInteger :: Integer -> Bool
is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
  where i64 = fromIntegral i :: Int64
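
-- A few worked examples of the truncation (facts of Int64 arithmetic):
--
--   is32BitInteger (-1)                 == True   -- truncates to -1
--   is32BitInteger 18446744073709551615 == True   -- 2^64-1 truncates to -1
--   is32BitInteger 0x80000000           == False  -- 2147483648 > 0x7fffffff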

-- | Convert a BlockId to some CmmStatic data
jumpTableEntry :: NCGConfig -> Maybe BlockId -> CmmStatic
jumpTableEntry config Nothing   = CmmStaticLit (CmmInt 0 (ncgWordWidth config))
jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
    where blockLabel = blockLbl blockid


-- -----------------------------------------------------------------------------
-- General things for putting together code sequences

-- Expand CmmRegOff.  ToDo: should we do it this way around, or convert
-- CmmExprs into CmmRegOff?
mangleIndexTree :: Platform -> CmmReg -> Int -> CmmExpr
mangleIndexTree platform reg off
  = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
  where width = typeWidth (cmmRegType platform reg)

-- | The dual to getAnyReg: compute an expression into a register, but
--   we don't mind which one it is.
getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
getSomeReg expr = do
  r <- getRegister expr
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, code tmp)
    Fixed _ reg code ->
        return (reg, code)


assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
assignMem_I64Code addrTree valueTree = do
  Amode addr addr_code <- getAmode addrTree
  ChildCode64 vcode rlo <- iselExpr64 valueTree
  let
        rhi = getHiVRegFromLo rlo

        -- Little-endian store
        mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
        mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
  return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)


assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
   ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
   let
         r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
         r_dst_hi = getHiVRegFromLo r_dst_lo
         r_src_hi = getHiVRegFromLo r_src_lo
         mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
         mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
   return (
        vcode `snocOL` mov_lo `snocOL` mov_hi
     )

assignReg_I64Code _ _
   = panic "assignReg_I64Code(i386): invalid lvalue"


iselExpr64 :: CmmExpr -> NatM ChildCode64
iselExpr64 (CmmLit (CmmInt i _)) = do
  (rlo,rhi) <- getNewRegPairNat II32
  let
        r = fromIntegral (fromIntegral i :: Word32)
        q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
        code = toOL [
                MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
                MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
                ]
  return (ChildCode64 code rlo)
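
-- A worked example of the split above: for the literal 0x100000002 we get
-- r = 2 (the low 32 bits) and q = 1 (the high 32 bits), so we emit
--   MOV $2, rlo
--   MOV $1, rhi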

iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
   Amode addr addr_code <- getAmode addrTree
   (rlo,rhi) <- getNewRegPairNat II32
   let
        mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
        mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
   return (
            ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
                        rlo
     )

iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
   = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))

-- we handle addition, but rather badly
iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   (rlo,rhi) <- getNewRegPairNat II32
   let
        r = fromIntegral (fromIntegral i :: Word32)
        q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
        r1hi = getHiVRegFromLo r1lo
        code =  code1 `appOL`
                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                       ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
                       MOV II32 (OpReg r1hi) (OpReg rhi),
                       ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
   return (ChildCode64 code rlo)
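
-- Note on the ADD/ADC pairing above and below: the low-word ADD sets the
-- carry flag, and ADC then folds that carry into the high-word addition,
-- which is exactly 64-bit addition performed in 32-bit halves.  SUB/SBB
-- work the same way with the borrow.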

iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   ChildCode64 code2 r2lo <- iselExpr64 e2
   (rlo,rhi) <- getNewRegPairNat II32
   let
        r1hi = getHiVRegFromLo r1lo
        r2hi = getHiVRegFromLo r2lo
        code =  code1 `appOL`
                code2 `appOL`
                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                       ADD II32 (OpReg r2lo) (OpReg rlo),
                       MOV II32 (OpReg r1hi) (OpReg rhi),
                       ADC II32 (OpReg r2hi) (OpReg rhi) ]
   return (ChildCode64 code rlo)

iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
   ChildCode64 code1 r1lo <- iselExpr64 e1
   ChildCode64 code2 r2lo <- iselExpr64 e2
   (rlo,rhi) <- getNewRegPairNat II32
   let
        r1hi = getHiVRegFromLo r1lo
        r2hi = getHiVRegFromLo r2lo
        code =  code1 `appOL`
                code2 `appOL`
                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
                       SUB II32 (OpReg r2lo) (OpReg rlo),
                       MOV II32 (OpReg r1hi) (OpReg rhi),
                       SBB II32 (OpReg r2hi) (OpReg rhi) ]
   return (ChildCode64 code rlo)

iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
     fn <- getAnyReg expr
     r_dst_lo <- getNewRegNat II32
     let r_dst_hi = getHiVRegFromLo r_dst_lo
         code = fn r_dst_lo
     return (
             ChildCode64 (code `snocOL`
                          MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
                          r_dst_lo
            )

iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
     fn <- getAnyReg expr
     r_dst_lo <- getNewRegNat II32
     let r_dst_hi = getHiVRegFromLo r_dst_lo
         code = fn r_dst_lo
     return (
             ChildCode64 (code `snocOL`
                          MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
                          CLTD II32 `snocOL`
                          MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
                          MOV II32 (OpReg edx) (OpReg r_dst_hi))
                          r_dst_lo
            )

iselExpr64 expr
   = do
     platform <- getPlatform
     pprPanic "iselExpr64(i386)" (pdoc platform expr)


--------------------------------------------------------------------------------
getRegister :: CmmExpr -> NatM Register
getRegister e = do platform <- getPlatform
                   is32Bit <- is32BitPlatform
                   getRegister' platform is32Bit e

getRegister' :: Platform -> Bool -> CmmExpr -> NatM Register

getRegister' platform is32Bit (CmmReg reg)
  = case reg of
        CmmGlobal PicBaseReg
         | is32Bit ->
            -- on x86_64, we have %rip for PicBaseReg, but it's not
            -- a full-featured register, it can only be used for
            -- rip-relative addressing.
            do reg' <- getPicBaseNat (archWordFormat is32Bit)
               return (Fixed (archWordFormat is32Bit) reg' nilOL)
        _ ->
            do
               let
                 fmt = cmmTypeFormat (cmmRegType platform reg)
                 format = fmt
               --
               platform <- ncgPlatform <$> getConfig
               return (Fixed format
                             (getRegisterReg platform reg)
                             nilOL)

getRegister' platform is32Bit (CmmRegOff r n)
  = getRegister' platform is32Bit $ mangleIndexTree platform r n

getRegister' platform is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
  = addAlignmentCheck align <$> getRegister' platform is32Bit e

-- for 32-bit architectures, support some 64 -> 32 bit conversions:
-- TO_W_(x), TO_W_(x >> 32)

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
                       [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
 | is32Bit = do
  ChildCode64 code rlo <- iselExpr64 x
  return $ Fixed II32 (getHiVRegFromLo rlo) code

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
                       [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
 | is32Bit = do
  ChildCode64 code rlo <- iselExpr64 x
  return $ Fixed II32 (getHiVRegFromLo rlo) code

getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
 | is32Bit = do
  ChildCode64 code rlo <- iselExpr64 x
  return $ Fixed II32 rlo code

getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
 | is32Bit = do
  ChildCode64 code rlo <- iselExpr64 x
  return $ Fixed II32 rlo code

getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
  float_const_sse2  where
  float_const_sse2
    | f == 0.0 = do
      let
        format = floatFormat w
        code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
        -- I don't know why there are xorpd, xorps, and pxor instructions.
        -- They all appear to do the same thing --SDM
      return (Any format code)

    | otherwise = do
      Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
      loadFloatAmode w addr code

695
696 -- catch simple cases of zero- or sign-extended load
697 getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
698 code <- intLoadCode (MOVZxL II8) addr
699 return (Any II32 code)
700
701 getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
702 code <- intLoadCode (MOVSxL II8) addr
703 return (Any II32 code)
704
705 getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
706 code <- intLoadCode (MOVZxL II16) addr
707 return (Any II32 code)
708
709 getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
710 code <- intLoadCode (MOVSxL II16) addr
711 return (Any II32 code)
712
713 -- catch simple cases of zero- or sign-extended load
714 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
715 | not is32Bit = do
716 code <- intLoadCode (MOVZxL II8) addr
717 return (Any II64 code)
718
719 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
720 | not is32Bit = do
721 code <- intLoadCode (MOVSxL II8) addr
722 return (Any II64 code)
723
724 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
725 | not is32Bit = do
726 code <- intLoadCode (MOVZxL II16) addr
727 return (Any II64 code)
728
729 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
730 | not is32Bit = do
731 code <- intLoadCode (MOVSxL II16) addr
732 return (Any II64 code)
733
734 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
735 | not is32Bit = do
736 code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
737 return (Any II64 code)
738
739 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
740 | not is32Bit = do
741 code <- intLoadCode (MOVSxL II32) addr
742 return (Any II64 code)
743
744 getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
745 CmmLit displacement])
746 | not is32Bit =
747 return $ Any II64 (\dst -> unitOL $
748 LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
749
750 getRegister' platform is32Bit (CmmMachOp mop [x]) = -- unary MachOps
751 case mop of
752 MO_F_Neg w -> sse2NegCode w x
753
754
755 MO_S_Neg w -> triv_ucode NEGI (intFormat w)
756 MO_Not w -> triv_ucode NOT (intFormat w)
757
758 -- Nop conversions
759 MO_UU_Conv W32 W8 -> toI8Reg W32 x
760 MO_SS_Conv W32 W8 -> toI8Reg W32 x
761 MO_XX_Conv W32 W8 -> toI8Reg W32 x
762 MO_UU_Conv W16 W8 -> toI8Reg W16 x
763 MO_SS_Conv W16 W8 -> toI8Reg W16 x
764 MO_XX_Conv W16 W8 -> toI8Reg W16 x
765 MO_UU_Conv W32 W16 -> toI16Reg W32 x
766 MO_SS_Conv W32 W16 -> toI16Reg W32 x
767 MO_XX_Conv W32 W16 -> toI16Reg W32 x
768
769 MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
770 MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
771 MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
772 MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
773 MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
774 MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
775 MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
776 MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
777 MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
778
779 MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
780 MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
781 MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
782
783 -- widenings
784 MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x
785 MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
786 MO_UU_Conv W8 W16 -> integerExtend W8 W16 MOVZxL x
787
788 MO_SS_Conv W8 W32 -> integerExtend W8 W32 MOVSxL x
789 MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
790 MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x
791
792 -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit we
793 -- have 8-bit registers only for a few registers (as opposed to x86-64 where every register
794 -- has 8-bit version). So for 32-bit code, we'll just zero-extend.
795 MO_XX_Conv W8 W32
796 | is32Bit -> integerExtend W8 W32 MOVZxL x
797 | otherwise -> integerExtend W8 W32 MOV x
798 MO_XX_Conv W8 W16
799 | is32Bit -> integerExtend W8 W16 MOVZxL x
800 | otherwise -> integerExtend W8 W16 MOV x
801 MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
802
803 MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x
804 MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
805 MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
806 MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x
807 MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
808 MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
809 -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
810 -- However, we don't want the register allocator to throw it
811 -- away as an unnecessary reg-to-reg move, so we keep it in
812 -- the form of a movzl and print it as a movl later.
813 -- This doesn't apply to MO_XX_Conv since in this case we don't care about
814 -- the upper bits. So we can just use MOV.
815 MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x
816 MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
817 MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
818
819 MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
820
821
822 MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
823
824 MO_FS_Conv from to -> coerceFP2Int from to x
825 MO_SF_Conv from to -> coerceInt2FP from to x
826
827 MO_V_Insert {} -> needLlvm
828 MO_V_Extract {} -> needLlvm
829 MO_V_Add {} -> needLlvm
830 MO_V_Sub {} -> needLlvm
831 MO_V_Mul {} -> needLlvm
832 MO_VS_Quot {} -> needLlvm
833 MO_VS_Rem {} -> needLlvm
834 MO_VS_Neg {} -> needLlvm
835 MO_VU_Quot {} -> needLlvm
836 MO_VU_Rem {} -> needLlvm
837 MO_VF_Insert {} -> needLlvm
838 MO_VF_Extract {} -> needLlvm
839 MO_VF_Add {} -> needLlvm
840 MO_VF_Sub {} -> needLlvm
841 MO_VF_Mul {} -> needLlvm
842 MO_VF_Quot {} -> needLlvm
843 MO_VF_Neg {} -> needLlvm
844
845 _other -> pprPanic "getRegister" (pprMachOp mop)
846 where
847 triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
848 triv_ucode instr format = trivialUCode format (instr format) x
849
850 -- signed or unsigned extension.
851 integerExtend :: Width -> Width
852 -> (Format -> Operand -> Operand -> Instr)
853 -> CmmExpr -> NatM Register
854 integerExtend from to instr expr = do
855 (reg,e_code) <- if from == W8 then getByteReg expr
856 else getSomeReg expr
857 let
858 code dst =
859 e_code `snocOL`
860 instr (intFormat from) (OpReg reg) (OpReg dst)
861 return (Any (intFormat to) code)
862
863 toI8Reg :: Width -> CmmExpr -> NatM Register
864 toI8Reg new_rep expr
865 = do codefn <- getAnyReg expr
866 return (Any (intFormat new_rep) codefn)
867 -- HACK: use getAnyReg to get a byte-addressable register.
868 -- If the source was a Fixed register, this will add the
869 -- mov instruction to put it into the desired destination.
870 -- We're assuming that the destination won't be a fixed
871 -- non-byte-addressable register; it won't be, because all
872 -- fixed registers are word-sized.
873
874 toI16Reg = toI8Reg -- for now
875
876 conversionNop :: Format -> CmmExpr -> NatM Register
877 conversionNop new_format expr
878 = do e_code <- getRegister' platform is32Bit expr
879 return (swizzleRegisterRep e_code new_format)
880
881
getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
  case mop of
      MO_F_Eq _ -> condFltReg is32Bit EQQ x y
      MO_F_Ne _ -> condFltReg is32Bit NE  x y
      MO_F_Gt _ -> condFltReg is32Bit GTT x y
      MO_F_Ge _ -> condFltReg is32Bit GE  x y
      -- Invert comparison condition and swap operands
      -- See Note [SSE Parity Checks]
      MO_F_Lt _ -> condFltReg is32Bit GTT y x
      MO_F_Le _ -> condFltReg is32Bit GE  y x

      MO_Eq _   -> condIntReg EQQ x y
      MO_Ne _   -> condIntReg NE  x y

      MO_S_Gt _ -> condIntReg GTT x y
      MO_S_Ge _ -> condIntReg GE  x y
      MO_S_Lt _ -> condIntReg LTT x y
      MO_S_Le _ -> condIntReg LE  x y

      MO_U_Gt _ -> condIntReg GU  x y
      MO_U_Ge _ -> condIntReg GEU x y
      MO_U_Lt _ -> condIntReg LU  x y
      MO_U_Le _ -> condIntReg LEU x y

      MO_F_Add  w -> trivialFCode_sse2 w ADD  x y

      MO_F_Sub  w -> trivialFCode_sse2 w SUB  x y

      MO_F_Quot w -> trivialFCode_sse2 w FDIV x y

      MO_F_Mul  w -> trivialFCode_sse2 w MUL  x y

      MO_Add rep -> add_code rep x y
      MO_Sub rep -> sub_code rep x y

      MO_S_Quot rep -> div_code rep True  True  x y
      MO_S_Rem  rep -> div_code rep True  False x y
      MO_U_Quot rep -> div_code rep False True  x y
      MO_U_Rem  rep -> div_code rep False False x y

      MO_S_MulMayOflo rep -> imulMayOflo rep x y

      MO_Mul W8  -> imulW8 x y
      MO_Mul rep -> triv_op rep IMUL
      MO_And rep -> triv_op rep AND
      MO_Or  rep -> triv_op rep OR
      MO_Xor rep -> triv_op rep XOR

        {- Shift ops on x86s have constraints on their source: the shift
           amount has to be an immediate, %cl, or 1.
            => trivialCode is not restrictive enough (sigh.)
        -}
      MO_Shl rep   -> shift_code rep SHL x y {-False-}
      MO_U_Shr rep -> shift_code rep SHR x y {-False-}
      MO_S_Shr rep -> shift_code rep SAR x y {-False-}

      MO_V_Insert {}   -> needLlvm
      MO_V_Extract {}  -> needLlvm
      MO_V_Add {}      -> needLlvm
      MO_V_Sub {}      -> needLlvm
      MO_V_Mul {}      -> needLlvm
      MO_VS_Quot {}    -> needLlvm
      MO_VS_Rem {}     -> needLlvm
      MO_VS_Neg {}     -> needLlvm
      MO_VF_Insert {}  -> needLlvm
      MO_VF_Extract {} -> needLlvm
      MO_VF_Add {}     -> needLlvm
      MO_VF_Sub {}     -> needLlvm
      MO_VF_Mul {}     -> needLlvm
      MO_VF_Quot {}    -> needLlvm
      MO_VF_Neg {}     -> needLlvm

      _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
  where
    --------------------
    triv_op width instr = trivialCode width op (Just op) x y
                        where op = instr (intFormat width)

    -- Special case for IMUL for bytes, since the result of IMULB will be in
    -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
    -- values.
    imulW8 :: CmmExpr -> CmmExpr -> NatM Register
    imulW8 arg_a arg_b = do
        (a_reg, a_code) <- getNonClobberedReg arg_a
        b_code <- getAnyReg arg_b

        let code = a_code `appOL` b_code eax `appOL`
                   toOL [ IMUL2 format (OpReg a_reg) ]
            format = intFormat W8

        return (Fixed format eax code)


    imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
    imulMayOflo rep a b = do
         (a_reg, a_code) <- getNonClobberedReg a
         b_code <- getAnyReg b
         let
             shift_amt = case rep of
                          W32 -> 31
                          W64 -> 63
                          _   -> panic "shift_amt"

             format = intFormat rep
             code = a_code `appOL` b_code eax `appOL`
                        toOL [
                           IMUL2 format (OpReg a_reg),   -- result in %edx:%eax
                           SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
                                -- sign extend lower part
                           SUB format (OpReg edx) (OpReg eax)
                                -- compare against upper
                                -- eax==0 if high part == sign extended low part
                        ]
         return (Fixed format eax code)

    --------------------
    shift_code :: Width
               -> (Format -> Operand -> Operand -> Instr)
               -> CmmExpr
               -> CmmExpr
               -> NatM Register

    {- Case1: shift length as immediate -}
    shift_code width instr x (CmmLit lit) = do
          x_code <- getAnyReg x
          let
               format = intFormat width
               code dst
                  = x_code dst `snocOL`
                    instr format (OpImm (litToImm lit)) (OpReg dst)
          return (Any format code)

    {- Case2: shift length is complex (non-immediate)
      * y must go in %ecx.
      * we cannot do y first *and* put its result in %ecx, because
        %ecx might be clobbered by x.
      * if we do y second, then x cannot be
        in a clobbered reg.  Also, we cannot clobber x's reg
        with the instruction itself.
      * so we can either:
        - do y first, put its result in a fresh tmp, then copy it to %ecx later
        - do y second and put its result into %ecx.  x gets placed in a fresh
          tmp.  This is likely to be better, because the reg alloc can
          eliminate this reg->reg move here (it won't eliminate the other one,
          because the move is into the fixed %ecx).
      * in the case of C calls the use of ecx here can interfere with arguments.
        We avoid this with the hack described in Note [Evaluate C-call
        arguments before placing in destination registers]
    -}
    shift_code width instr x y{-amount-} = do
        x_code <- getAnyReg x
        let format = intFormat width
        tmp <- getNewRegNat format
        y_code <- getAnyReg y
        let
           code = x_code tmp `appOL`
                  y_code ecx `snocOL`
                  instr format (OpReg ecx) (OpReg tmp)
        return (Fixed format tmp code)

    --------------------
    add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
    add_code rep x (CmmLit (CmmInt y _))
        | is32BitInteger y
        , rep /= W8 -- LEA doesn't support byte size (#18614)
        = add_int rep x y
    add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
      where format = intFormat rep
    -- TODO: There are other interesting patterns we want to replace
    --     with a LEA, e.g. `(x + offset) + (y << shift)`.

    --------------------
    sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
    sub_code rep x (CmmLit (CmmInt y _))
        | is32BitInteger (-y)
        , rep /= W8 -- LEA doesn't support byte size (#18614)
        = add_int rep x (-y)
    sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y

    -- our three-operand add instruction:
    add_int width x y = do
        (x_reg, x_code) <- getSomeReg x
        let
            format = intFormat width
            imm = ImmInt (fromInteger y)
            code dst
               = x_code `snocOL`
                 LEA format
                        (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
                        (OpReg dst)
        --
        return (Any format code)

    ----------------------

    -- See Note [DIV/IDIV for bytes]
    div_code W8 signed quotient x y = do
        let widen | signed    = MO_SS_Conv W8 W16
                  | otherwise = MO_UU_Conv W8 W16
        div_code
            W16
            signed
            quotient
            (CmmMachOp widen [x])
            (CmmMachOp widen [y])

    div_code width signed quotient x y = do
           (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
           x_code <- getAnyReg x
           let
             format = intFormat width
             widen | signed    = CLTD format
                   | otherwise = XOR format (OpReg edx) (OpReg edx)

             instr | signed    = IDIV
                   | otherwise = DIV

             code = y_code `appOL`
                    x_code eax `appOL`
                    toOL [widen, instr format y_op]

             result | quotient  = eax
                    | otherwise = edx

           return (Fixed format result code)
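
    -- As a concrete illustration of div_code (a sketch): a signed 32-bit
    -- quotient `MO_S_Quot W32 x y` comes out roughly as
    --   <code for y into y_op>; <code for x into %eax>; cltd; idivl y_op
    -- with the quotient left in %eax (the remainder would be in %edx).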

getRegister' _ _ (CmmLoad mem pk)
  | isFloatType pk
  = do
    Amode addr mem_code <- getAmode mem
    loadFloatAmode (typeWidth pk) addr mem_code

getRegister' _ is32Bit (CmmLoad mem pk)
  | is32Bit && not (isWord64 pk)
  = do
    code <- intLoadCode instr mem
    return (Any format code)
  where
    width = typeWidth pk
    format = intFormat width
    instr = case width of
                W8     -> MOVZxL II8
                _other -> MOV format
        -- We always zero-extend 8-bit loads, if we
        -- can't think of anything better.  This is because
        -- we can't guarantee access to an 8-bit variant of every register
        -- (esi and edi don't have 8-bit variants), so to make things
        -- simpler we do our 8-bit arithmetic with full 32-bit registers.

-- Simpler memory load code on x86_64
getRegister' _ is32Bit (CmmLoad mem pk)
 | not is32Bit
  = do
    code <- intLoadCode (MOV format) mem
    return (Any format code)
  where format = intFormat $ typeWidth pk

getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
  = let
        format = intFormat width

        -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
        format1 = if is32Bit then format
                  else case format of
                         II64 -> II32
                         _    -> format
        code dst
           = unitOL (XOR format1 (OpReg dst) (OpReg dst))
    in
        return (Any format code)

-- optimisation for loading small literals on x86_64: take advantage
-- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
-- instruction forms are shorter.
getRegister' platform is32Bit (CmmLit lit)
  | not is32Bit, isWord64 (cmmLitType platform lit), not (isBigLit lit)
  = let
        imm = litToImm lit
        code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
    in
        return (Any II64 code)
  where
   isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
   isBigLit _ = False
        -- note1: not the same as (not.is32BitLit), because that checks for
        -- signed literals that fit in 32 bits, but we want unsigned
        -- literals here.
        -- note2: all labels are small, because we're assuming the
        -- small memory model (see gcc docs, -mcmodel=small).

getRegister' platform _ (CmmLit lit)
  = do let format = cmmTypeFormat (cmmLitType platform lit)
           imm = litToImm lit
           code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
       return (Any format code)

getRegister' platform _ other
  | isVecExpr other = needLlvm
  | otherwise       = pprPanic "getRegister(x86)" (pdoc platform other)


intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
            -> NatM (Reg -> InstrBlock)
intLoadCode instr mem = do
  Amode src mem_code <- getAmode mem
  return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))

-- Compute an expression into *any* register, adding the appropriate
-- move instruction if necessary.
getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
getAnyReg expr = do
  r <- getRegister expr
  anyReg r

anyReg :: Register -> NatM (Reg -> InstrBlock)
anyReg (Any _ code)          = return code
anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)

-- A bit like getSomeReg, but we want a reg that can be byte-addressed.
-- Fixed registers might not be byte-addressable, so we make sure we've
-- got a temporary, inserting an extra reg copy if necessary.
getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
getByteReg expr = do
  is32Bit <- is32BitPlatform
  if is32Bit
      then do r <- getRegister expr
              case r of
                Any rep code -> do
                    tmp <- getNewRegNat rep
                    return (tmp, code tmp)
                Fixed rep reg code
                    | isVirtualReg reg -> return (reg,code)
                    | otherwise -> do
                        tmp <- getNewRegNat rep
                        return (tmp, code `snocOL` reg2reg rep reg tmp)
                    -- ToDo: could optimise slightly by checking for
                    -- byte-addressable real registers, but that will
                    -- happen very rarely if at all.
      else getSomeReg expr -- all regs are byte-addressable on x86_64

-- Another variant: this time we want the result in a register that cannot
-- be modified by code to evaluate an arbitrary expression.
getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
getNonClobberedReg expr = do
  r <- getRegister expr
  platform <- ncgPlatform <$> getConfig
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, code tmp)
    Fixed rep reg code
        -- only certain regs can be clobbered
        | reg `elem` instrClobberedRegs platform
        -> do
            tmp <- getNewRegNat rep
            return (tmp, code `snocOL` reg2reg rep reg tmp)
        | otherwise ->
            return (reg, code)

reg2reg :: Format -> Reg -> Reg -> Instr
reg2reg format src dst = MOV format (OpReg src) (OpReg dst)


--------------------------------------------------------------------------------

-- | Convert a 'CmmExpr' representing a memory address into an 'Amode'.
--
-- An 'Amode' is a datatype representing a valid address form for the target
-- (e.g. "Base + Index + disp" or immediate) and the code to compute it.
getAmode :: CmmExpr -> NatM Amode
getAmode e = do
   platform <- getPlatform
   let is32Bit = target32Bit platform

   case e of
      CmmRegOff r n
         -> getAmode $ mangleIndexTree platform r n

      CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), CmmLit displacement]
         | not is32Bit
         -> return $ Amode (ripRel (litToImm displacement)) nilOL

      -- This is all just ridiculous, since it carefully undoes
      -- what mangleIndexTree has just done.
      CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)]
         | is32BitLit is32Bit lit
         -- assert (rep == II32)???
         -> do
            (x_reg, x_code) <- getSomeReg x
            let off = ImmInt (-(fromInteger i))
            return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)

      CmmMachOp (MO_Add _rep) [x, CmmLit lit]
         | is32BitLit is32Bit lit
         -- assert (rep == II32)???
         -> do
            (x_reg, x_code) <- getSomeReg x
            let off = litToImm lit
            return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)

      -- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
      -- recognised by the next rule.
      CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _), b@(CmmLit _)]
         -> getAmode (CmmMachOp (MO_Add rep) [b,a])

      -- Matches: (x + offset) + (y << shift)
      CmmMachOp (MO_Add _) [CmmRegOff x offset, CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)]]
         | shift == 0 || shift == 1 || shift == 2 || shift == 3
         -> x86_complex_amode (CmmReg x) y shift (fromIntegral offset)

      CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)]]
         | shift == 0 || shift == 1 || shift == 2 || shift == 3
         -> x86_complex_amode x y shift 0

      CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Add _) [CmmMachOp (MO_Shl _)
                                        [y, CmmLit (CmmInt shift _)], CmmLit (CmmInt offset _)]]
         | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
         && is32BitInteger offset
         -- NB: the parentheses around the shift test matter; without them
         -- the && would apply only to the shift == 3 alternative.
         -> x86_complex_amode x y shift offset

      CmmMachOp (MO_Add _) [x,y]
         | not (isLit y) -- we already handle valid literals above.
         -> x86_complex_amode x y 0 0

      CmmLit lit
         | is32BitLit is32Bit lit
         -> return (Amode (ImmAddr (litToImm lit) 0) nilOL)

      -- Literals with offsets too big (> 32 bits) fail during the linking phase
      -- (#15570). We already handled valid literals above so we don't have to
      -- test anything here.
      CmmLit (CmmLabelOff l off)
         -> getAmode (CmmMachOp (MO_Add W64) [ CmmLit (CmmLabel l)
                                             , CmmLit (CmmInt (fromIntegral off) W64)
                                             ])
      CmmLit (CmmLabelDiffOff l1 l2 off w)
         -> getAmode (CmmMachOp (MO_Add W64) [ CmmLit (CmmLabelDiffOff l1 l2 0 w)
                                             , CmmLit (CmmInt (fromIntegral off) W64)
                                             ])

      -- in case we can't do something better, we just compute the expression
      -- and put the result in a register
      _ -> do
        (reg,code) <- getSomeReg e
        return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
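
-- For example (a sketch): an address expression `x + (y << 3) + 16` is
-- matched by the shift rules above and becomes a single scaled amode,
--   AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg 8) (ImmInt 16)
-- i.e. one operand `16(x_reg,y_reg,8)` rather than separate shift and
-- add instructions.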


-- | Like 'getAmode', but on 32-bit use simple register addressing
-- (i.e. no index register). This stops us from running out of
-- registers on x86 when using instructions such as cmpxchg, which can
-- use up to three virtual registers and one fixed register.
getSimpleAmode :: Bool -> CmmExpr -> NatM Amode
getSimpleAmode is32Bit addr
  | is32Bit = do
      addr_code <- getAnyReg addr
      config <- getConfig
      addr_r <- getNewRegNat (intFormat (ncgWordWidth config))
      let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
      return $! Amode amode (addr_code addr_r)
  | otherwise = getAmode addr

x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
x86_complex_amode base index shift offset
  = do (x_reg, x_code) <- getNonClobberedReg base
        -- x must be in a temp, because it has to stay live over y_code
        -- we could compare x_reg and y_reg and do something better here...
       (y_reg, y_code) <- getSomeReg index
       let
           code = x_code `appOL` y_code
           base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
                                n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
               code)


-- -----------------------------------------------------------------------------
-- getOperand: sometimes any operand will do.

-- getNonClobberedOperand: the value of the operand will remain valid across
-- the computation of an arbitrary expression, unless the expression
-- is computed directly into a register which the operand refers to
-- (see trivialCode where this function is used for an example).

getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand (CmmLit lit) =
  if isSuitableFloatingPointLit lit
    then do
      let CmmFloat _ w = lit
      Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
      return (OpAddr addr, code)
    else do
      is32Bit <- is32BitPlatform
      platform <- getPlatform
      if is32BitLit is32Bit lit && not (isFloatType (cmmLitType platform lit))
        then return (OpImm (litToImm lit), nilOL)
        else getNonClobberedOperand_generic (CmmLit lit)

getNonClobberedOperand (CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  -- this logic could be simplified
  -- TODO FIXME
  if (if is32Bit then not (isWord64 pk) else True)
      -- if 32bit and pk is not a 64-bit word value,
      -- or if 64bit
      -- this could use some eyeballs or I'll need to stare at it more later
    then do
      platform <- ncgPlatform <$> getConfig
      Amode src mem_code <- getAmode mem
      (src',save_code) <-
        if (amodeCouldBeClobbered platform src)
          then do
            tmp <- getNewRegNat (archWordFormat is32Bit)
            return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
                    unitOL (LEA (archWordFormat is32Bit)
                                (OpAddr src)
                                (OpReg tmp)))
          else
            return (src, nilOL)
      return (OpAddr src', mem_code `appOL` save_code)
    else
      -- if it's a word or gcptr on 32bit?
      getNonClobberedOperand_generic (CmmLoad mem pk)

getNonClobberedOperand e = getNonClobberedOperand_generic e

getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand_generic e = do
  (reg, code) <- getNonClobberedReg e
  return (OpReg reg, code)

amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)

regClobbered :: Platform -> Reg -> Bool
regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
regClobbered _ _ = False

-- getOperand: the operand is not required to remain valid across the
-- computation of an arbitrary expression.
getOperand :: CmmExpr -> NatM (Operand, InstrBlock)

getOperand (CmmLit lit) = do
  use_sse2 <- sse2Enabled
  if (use_sse2 && isSuitableFloatingPointLit lit)
    then do
      let CmmFloat _ w = lit
      Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
      return (OpAddr addr, code)
    else do
      is32Bit <- is32BitPlatform
      platform <- getPlatform
      if is32BitLit is32Bit lit && not (isFloatType (cmmLitType platform lit))
        then return (OpImm (litToImm lit), nilOL)
        else getOperand_generic (CmmLit lit)

getOperand (CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
     then do
       Amode src mem_code <- getAmode mem
       return (OpAddr src, mem_code)
     else
       getOperand_generic (CmmLoad mem pk)

getOperand e = getOperand_generic e

getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
getOperand_generic e = do
    (reg, code) <- getSomeReg e
    return (OpReg reg, code)

isOperand :: Bool -> CmmExpr -> Bool
isOperand _ (CmmLoad _ _)      = True
isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
                                 || isSuitableFloatingPointLit lit
isOperand _ _                  = False

-- | Given a 'Register', produce a new 'Register' with an instruction block
-- which will check the value for alignment. Used for @-falignment-sanitisation@.
addAlignmentCheck :: Int -> Register -> Register
addAlignmentCheck align reg =
    case reg of
      Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
      Any fmt f          -> Any fmt (\reg -> f reg `appOL` check fmt reg)
  where
    check :: Format -> Reg -> InstrBlock
    check fmt reg =
        assert (not $ isFloatFormat fmt) $
        toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
             , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
             ]
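
-- For instance (a worked case): with align = 8 the mask is 7, so the check
-- is `TEST $7, reg` followed by a jump to the bad-alignment label when any
-- of the low three bits are set, i.e. when reg is not 8-byte aligned.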

memConstant :: Alignment -> CmmLit -> NatM Amode
memConstant align lit = do
  lbl <- getNewLabelNat
  let rosection = Section ReadOnlyData lbl
  config <- getConfig
  platform <- getPlatform
  (addr, addr_code) <- if target32Bit platform
                       then do dynRef <- cmmMakeDynamicReference
                                             config
                                             DataReference
                                             lbl
                               Amode addr addr_code <- getAmode dynRef
                               return (addr, addr_code)
                       else return (ripRel (ImmCLbl lbl), nilOL)
  let code =
        LDATA rosection (align, CmmStaticsRaw lbl [CmmStaticLit lit])
        `consOL` addr_code
  return (Amode addr code)


loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
loadFloatAmode w addr addr_code = do
  let format = floatFormat w
      code dst = addr_code `snocOL`
                 MOV format (OpAddr addr) (OpReg dst)

  return (Any format code)


-- if we want a floating-point literal as an operand, we can
-- use it directly from memory.  However, if the literal is
-- zero, we're better off generating it into a register using
-- xor.
isSuitableFloatingPointLit :: CmmLit -> Bool
isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
isSuitableFloatingPointLit _ = False

getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
getRegOrMem e@(CmmLoad mem pk) = do
  is32Bit <- is32BitPlatform
  use_sse2 <- sse2Enabled
  if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
     then do
       Amode src mem_code <- getAmode mem
       return (OpAddr src, mem_code)
     else do
       (reg, code) <- getNonClobberedReg e
       return (OpReg reg, code)
getRegOrMem e = do
    (reg, code) <- getNonClobberedReg e
    return (OpReg reg, code)

is32BitLit :: Bool -> CmmLit -> Bool
is32BitLit is32Bit lit
   | not is32Bit = case lit of
      CmmInt i W64              -> is32BitInteger i
      -- assume that labels are in the range 0-2^31-1: this assumes the
      -- small memory model (see gcc docs, -mcmodel=small).
      CmmLabel _                -> True
      -- however we can't assume that label offsets are in this range
      -- (see #15570)
      CmmLabelOff _ off         -> is32BitInteger (fromIntegral off)
      CmmLabelDiffOff _ _ off _ -> is32BitInteger (fromIntegral off)
      _                         -> True
is32BitLit _ _ = True




-- Set up a condition code for a conditional branch.

getCondCode :: CmmExpr -> NatM CondCode

-- yes, they really do seem to want exactly the same!

getCondCode (CmmMachOp mop [x, y])
  =
    case mop of
      MO_F_Eq W32 -> condFltCode EQQ x y
      MO_F_Ne W32 -> condFltCode NE  x y
      MO_F_Gt W32 -> condFltCode GTT x y
      MO_F_Ge W32 -> condFltCode GE  x y
      -- Invert comparison condition and swap operands
      -- See Note [SSE Parity Checks]
      MO_F_Lt W32 -> condFltCode GTT y x
      MO_F_Le W32 -> condFltCode GE  y x

      MO_F_Eq W64 -> condFltCode EQQ x y
      MO_F_Ne W64 -> condFltCode NE  x y
      MO_F_Gt W64 -> condFltCode GTT x y
      MO_F_Ge W64 -> condFltCode GE  x y
      MO_F_Lt W64 -> condFltCode GTT y x
      MO_F_Le W64 -> condFltCode GE  y x

      _ -> condIntCode (machOpToCond mop) x y

getCondCode other = do
   platform <- getPlatform
   pprPanic "getCondCode(2)(x86,x86_64)" (pdoc platform other)

machOpToCond :: MachOp -> Cond
machOpToCond mo = case mo of
  MO_Eq _   -> EQQ
  MO_Ne _   -> NE
  MO_S_Gt _ -> GTT
  MO_S_Ge _ -> GE
  MO_S_Lt _ -> LTT
  MO_S_Le _ -> LE
  MO_U_Gt _ -> GU
  MO_U_Ge _ -> GEU
  MO_U_Lt _ -> LU
  MO_U_Le _ -> LEU
  _other -> pprPanic "machOpToCond" (pprMachOp mo)
1594
1595
1596 -- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
1597 -- passed back up the tree.
1598
1599 condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1600 condIntCode cond x y = do is32Bit <- is32BitPlatform
1601 condIntCode' is32Bit cond x y
1602
1603 condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1604
1605 -- memory vs immediate
1606 condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
1607 | is32BitLit is32Bit lit = do
1608 Amode x_addr x_code <- getAmode x
1609 let
1610 imm = litToImm lit
1611 code = x_code `snocOL`
1612 CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
1613 --
1614 return (CondCode False cond code)
1615
1616 -- anything vs zero, using a mask
1617 -- TODO: Add some sanity checking!!!!
1618 condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
1619 | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
1620 = do
1621 (x_reg, x_code) <- getSomeReg x
1622 let
1623 code = x_code `snocOL`
1624 TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
1625 --
1626 return (CondCode False cond code)
1627
1628 -- anything vs zero
1629 condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
1630 (x_reg, x_code) <- getSomeReg x
1631 let
1632 code = x_code `snocOL`
1633 TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
1634 --
1635 return (CondCode False cond code)
1636
1637 -- anything vs operand
1638 condIntCode' is32Bit cond x y
1639 | isOperand is32Bit y = do
1640 platform <- getPlatform
1641 (x_reg, x_code) <- getNonClobberedReg x
1642 (y_op, y_code) <- getOperand y
1643 let
1644 code = x_code `appOL` y_code `snocOL`
1645 CMP (cmmTypeFormat (cmmExprType platform x)) y_op (OpReg x_reg)
1646 return (CondCode False cond code)
1647 -- operand vs. anything: invert the comparison so that we can use a
1648 -- single comparison instruction.
1649 | isOperand is32Bit x
1650 , Just revcond <- maybeFlipCond cond = do
1651 platform <- getPlatform
1652 (y_reg, y_code) <- getNonClobberedReg y
1653 (x_op, x_code) <- getOperand x
1654 let
1655 code = y_code `appOL` x_code `snocOL`
1656 CMP (cmmTypeFormat (cmmExprType platform x)) x_op (OpReg y_reg)
1657 return (CondCode False revcond code)
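  -- For instance (a sketch): a comparison like (lit < y) with a complex y
  -- is flipped to (y > lit), so the literal can be the CMP source operand.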
1658
1659 -- anything vs anything
1660 condIntCode' _ cond x y = do
1661 platform <- getPlatform
1662 (y_reg, y_code) <- getNonClobberedReg y
1663 (x_op, x_code) <- getRegOrMem x
1664 let
1665 code = y_code `appOL`
1666 x_code `snocOL`
1667 CMP (cmmTypeFormat (cmmExprType platform x)) (OpReg y_reg) x_op
1668 return (CondCode False cond code)
1669
1670
1671
1672 --------------------------------------------------------------------------------
1673 condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1674
1675 condFltCode cond x y
1676 = condFltCode_sse2
1677 where
1678
1679
1680 -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
1681 -- an operand, but the right must be a reg. We can probably do better
1682 -- than this general case...
1683 condFltCode_sse2 = do
1684 platform <- getPlatform
1685 (x_reg, x_code) <- getNonClobberedReg x
1686 (y_op, y_code) <- getOperand y
1687 let
1688 code = x_code `appOL`
1689 y_code `snocOL`
1690 CMP (floatFormat $ cmmExprWidth platform x) y_op (OpReg x_reg)
1691 -- NB(1): we need to use the unsigned comparison operators on the
1692 -- result of this comparison.
1693 return (CondCode True (condToUnsigned cond) code)
1694
1695 -- -----------------------------------------------------------------------------
1696 -- Generating assignments
1697
1698 -- Assignments are really at the heart of the whole code generation
1699 -- business. Almost all top-level nodes of any real importance are
1700 -- assignments, which correspond to loads, stores, or register
1701 -- transfers. If we're really lucky, some of the register transfers
1702 -- will go away, because we can use the destination register to
1703 -- complete the code generation for the right hand side. This only
1704 -- fails when the right hand side is forced into a fixed register
1705 -- (e.g. the result of a call).
1706
1707 assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
1708 assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
1709
1710 assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
1711 assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
1712
1713
1714 -- integer assignment to memory
1715
1716 -- specific case of adding/subtracting an integer to a particular address.
1717 -- ToDo: catch other cases where we can use an operation directly on a memory
1718 -- address.
1719 assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
1720 CmmLit (CmmInt i _)])
1721 | addr == addr2, pk /= II64 || is32BitInteger i,
1722 Just instr <- check op
1723 = do Amode amode code_addr <- getAmode addr
1724 let code = code_addr `snocOL`
1725 instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
1726 return code
1727 where
1728 check (MO_Add _) = Just ADD
1729 check (MO_Sub _) = Just SUB
1730 check _ = Nothing
1731 -- ToDo: more?
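
-- For instance (an illustrative sketch): I32[addr] = I32[addr] + 16
-- becomes a single read-modify-write instruction,
--
--      addl $16, (addr)
--
-- instead of a separate load, add and store.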
1732
1733 -- general case
1734 assignMem_IntCode pk addr src = do
1735 is32Bit <- is32BitPlatform
1736 Amode addr code_addr <- getAmode addr
1737 (code_src, op_src) <- get_op_RI is32Bit src
1738 let
1739 code = code_src `appOL`
1740 code_addr `snocOL`
1741 MOV pk op_src (OpAddr addr)
1742 -- NOTE: op_src is stable, so it will still be valid
1743 -- after code_addr. This may involve the introduction
1744 -- of an extra MOV to a temporary register, but we hope
1745 -- the register allocator will get rid of it.
1746 --
1747 return code
1748 where
    get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand)   -- code, operand
1750 get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
1751 = return (nilOL, OpImm (litToImm lit))
1752 get_op_RI _ op
1753 = do (reg,code) <- getNonClobberedReg op
1754 return (code, OpReg reg)
1755
1756
1757 -- Assign; dst is a reg, rhs is mem
1758 assignReg_IntCode pk reg (CmmLoad src _) = do
1759 load_code <- intLoadCode (MOV pk) src
1760 platform <- ncgPlatform <$> getConfig
1761 return (load_code (getRegisterReg platform reg))
1762
1763 -- dst is a reg, but src could be anything
1764 assignReg_IntCode _ reg src = do
1765 platform <- ncgPlatform <$> getConfig
1766 code <- getAnyReg src
1767 return (code (getRegisterReg platform reg))
1768
1769
1770 -- Floating point assignment to memory
1771 assignMem_FltCode pk addr src = do
1772 (src_reg, src_code) <- getNonClobberedReg src
1773 Amode addr addr_code <- getAmode addr
1774 let
1775 code = src_code `appOL`
1776 addr_code `snocOL`
1777 MOV pk (OpReg src_reg) (OpAddr addr)
1778
1779 return code
1780
1781 -- Floating point assignment to a register/temporary
1782 assignReg_FltCode _ reg src = do
1783 src_code <- getAnyReg src
1784 platform <- ncgPlatform <$> getConfig
1785 return (src_code (getRegisterReg platform reg))
1786
1787
1788 genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
1789
1790 genJump (CmmLoad mem _) regs = do
1791 Amode target code <- getAmode mem
1792 return (code `snocOL` JMP (OpAddr target) regs)
1793
1794 genJump (CmmLit lit) regs =
1795 return (unitOL (JMP (OpImm (litToImm lit)) regs))
1796
1797 genJump expr regs = do
1798 (reg,code) <- getSomeReg expr
1799 return (code `snocOL` JMP (OpReg reg) regs)
1800
1801
1802 -- -----------------------------------------------------------------------------
1803 -- Unconditional branches
1804
1805 genBranch :: BlockId -> InstrBlock
1806 genBranch = toOL . mkJumpInstr
1807
1808
1809
1810 -- -----------------------------------------------------------------------------
1811 -- Conditional jumps/branches
1812
1813 {-
1814 Conditional jumps are always to local labels, so we can use branch
1815 instructions. We peek at the arguments to decide what kind of
1816 comparison to do.
1817
1818 I386: First, we have to ensure that the condition
1819 codes are set according to the supplied comparison operation.
1820 -}
1821
1822 {- Note [64-bit integer comparisons on 32-bit]
1823 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1824
1825 When doing these comparisons there are 2 kinds of
1826 comparisons.
1827
1828 * Comparison for equality (or lack thereof)
1829
1830 We use xor to check if high/low bits are
1831 equal. Then combine the results using or and
1832 perform a single conditional jump based on the
1833 result.
1834
1835 * Other comparisons:
1836
1837 We map all other comparisons to the >= operation.
1838 Why? Because it's easy to encode it with a single
1839 conditional jump.
1840
1841 We do this by first computing [r1_lo - r2_lo]
1842 and use the carry flag to compute
1843 [r1_high - r2_high - CF].
1844
1845 At which point if r1 >= r2 then the result will be
1846 positive. Otherwise negative so we can branch on this
1847 condition.
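
  As an illustrative sketch, the signed >= case comes out as
  (AT&T operand order, src then dst; see cmpGE below):

      cmp  r2_lo, r1_lo    -- compute r1_lo - r2_lo, setting CF
      sbb  r2_hi, r1_hi    -- compute r1_hi - r2_hi - CF
      jge  Ltrue
      jmp  Lfalse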
1848
1849 -}
1850
1851
1852 genCondBranch
1853 :: BlockId -- the source of the jump
1854 -> BlockId -- the true branch target
1855 -> BlockId -- the false branch target
1856 -> CmmExpr -- the condition on which to branch
1857 -> NatM InstrBlock -- Instructions
1858
1859 genCondBranch bid id false expr = do
1860 is32Bit <- is32BitPlatform
1861 genCondBranch' is32Bit bid id false expr
1862
1863 -- | We return the instructions generated.
1864 genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
1865 -> NatM InstrBlock
1866
1867 -- 64-bit integer comparisons on 32-bit
1868 -- See Note [64-bit integer comparisons on 32-bit]
1869 genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
1870 | is32Bit, Just W64 <- maybeIntComparison mop = do
1871
  -- Each of the resulting registers holds the lower half of the value;
  -- the upper half is reachable from it via getHiVRegFromLo.
1874 ChildCode64 code1 r1 <- iselExpr64 e1
1875 ChildCode64 code2 r2 <- iselExpr64 e2
1876 let cond = machOpToCond mop :: Cond
1877
1878 let cmpCode = intComparison cond true false r1 r2
1879 return $ code1 `appOL` code2 `appOL` cmpCode
1880
1881 where
1882 intComparison :: Cond -> BlockId -> BlockId -> Reg -> Reg -> InstrBlock
1883 intComparison cond true false r1_lo r2_lo =
1884 case cond of
1885 -- Impossible results of machOpToCond
1886 ALWAYS -> panic "impossible"
1887 NEG -> panic "impossible"
1888 POS -> panic "impossible"
1889 CARRY -> panic "impossible"
1890 OFLO -> panic "impossible"
1891 PARITY -> panic "impossible"
1892 NOTPARITY -> panic "impossible"
1893 -- Special case #1 x == y and x != y
1894 EQQ -> cmpExact
1895 NE -> cmpExact
1896 -- [x >= y]
1897 GE -> cmpGE
1898 GEU -> cmpGE
1899 -- [x > y] <==> ![y >= x]
1900 GTT -> intComparison GE false true r2_lo r1_lo
1901 GU -> intComparison GEU false true r2_lo r1_lo
1902 -- [x <= y] <==> [y >= x]
1903 LE -> intComparison GE true false r2_lo r1_lo
1904 LEU -> intComparison GEU true false r2_lo r1_lo
          -- [x < y] <==> ![x >= y]
1906 LTT -> intComparison GE false true r1_lo r2_lo
1907 LU -> intComparison GEU false true r1_lo r2_lo
1908 where
1909 r1_hi = getHiVRegFromLo r1_lo
1910 r2_hi = getHiVRegFromLo r2_lo
1911 cmpExact :: OrdList Instr
1912 cmpExact =
1913 toOL
1914 [ XOR II32 (OpReg r2_hi) (OpReg r1_hi)
1915 , XOR II32 (OpReg r2_lo) (OpReg r1_lo)
1916 , OR II32 (OpReg r1_hi) (OpReg r1_lo)
1917 , JXX cond true
1918 , JXX ALWAYS false
1919 ]
1920 cmpGE = toOL
1921 [ CMP II32 (OpReg r2_lo) (OpReg r1_lo)
1922 , SBB II32 (OpReg r2_hi) (OpReg r1_hi)
1923 , JXX cond true
1924 , JXX ALWAYS false ]
1925
1926 genCondBranch' _ bid id false bool = do
1927 CondCode is_float cond cond_code <- getCondCode bool
1928 use_sse2 <- sse2Enabled
1929 if not is_float || not use_sse2
1930 then
1931 return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
1932 else do
1933 -- See Note [SSE Parity Checks]
1934 let jmpFalse = genBranch false
1935 code
1936 = case cond of
1937 NE -> or_unordered
1938 GU -> plain_test
1939 GEU -> plain_test
1940 -- Use ASSERT so we don't break releases if
1941 -- LTT/LE creep in somehow.
1942 LTT ->
1943 assertPpr False (ppr "Should have been turned into >")
1944 and_ordered
1945 LE ->
1946 assertPpr False (ppr "Should have been turned into >=")
1947 and_ordered
1948 _ -> and_ordered
1949
1950 plain_test = unitOL (
1951 JXX cond id
1952 ) `appOL` jmpFalse
1953 or_unordered = toOL [
1954 JXX cond id,
1955 JXX PARITY id
1956 ] `appOL` jmpFalse
1957 and_ordered = toOL [
1958 JXX PARITY false,
1959 JXX cond id,
1960 JXX ALWAYS false
1961 ]
1962 updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
1963 return (cond_code `appOL` code)
1964
1965 {- Note [Introducing cfg edges inside basic blocks]
1966 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1967
1968 During instruction selection a statement `s`
1969 in a block B with control of the sort: B -> C
1970 will sometimes result in control
1971 flow of the sort:
1972
1973 ┌ < ┐
1974 v ^
1975 B -> B1 ┴ -> C
1976
1977 as is the case for some atomic operations.
1978
1979 Now to keep the CFG in sync when introducing B1 we clearly
1980 want to insert it between B and C. However there is
1981 a catch when we have to deal with self loops.
1982
1983 We might start with code and a CFG of these forms:
1984
1985 loop:
1986 stmt1 ┌ < ┐
1987 .... v ^
1988 stmtX loop ┘
1989 stmtY
1990 ....
1991 goto loop:
1992
1993 Now we introduce B1:
1994 ┌ ─ ─ ─ ─ ─┐
1995 loop: │ ┌ < ┐ │
1996 instrs v │ │ ^
1997 .... loop ┴ B1 ┴ ┘
1998 instrsFromX
1999 stmtY
2000 goto loop:
2001
This is simple: all outgoing edges from loop now simply
start from B1 instead, and the code generator knows which
new edges it introduced for the self loop of B1.
2005
2006 Disaster strikes if the statement Y follows the same pattern.
2007 If we apply the same rule that all outgoing edges change then
2008 we end up with:
2009
    loop ─> B1 ─> B2 ┬─┐
      │      │    └─<┤ │
      │      └───<───┘ │
      └───────<────────┘
2014
This is problematic: following that rule, the self-loop edge B1->B1
gets rewritten to start at B2, giving B2->B1. That modification is wrong!
2017
2018 The assembly in this case looked like this:
2019
2020 _loop:
2021 <instrs>
2022 _B1:
2023 ...
2024 cmpxchgq ...
2025 jne _B1
2026 <instrs>
2027 <end _B1>
2028 _B2:
2029 ...
2030 cmpxchgq ...
2031 jne _B2
2032 <instrs>
2033 jmp loop
2034
2035 There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
2036
2037 The problem here is that really B1 should be two basic blocks.
2038 Otherwise we have control flow in the *middle* of a basic block.
2039 A contradiction!
2040
2041 So to account for this we add yet another basic block marker:
2042
2043 _B:
2044 <instrs>
2045 _B1:
2046 ...
2047 cmpxchgq ...
2048 jne _B1
2049 jmp _B1'
2050 _B1':
2051 <instrs>
2052 <end _B1>
2053 _B2:
2054 ...
2055
2056 Now when inserting B2 we will only look at the outgoing edges of B1' and
2057 everything will work out nicely.
2058
2059 You might also wonder why we don't insert jumps at the end of _B1'. There is
2060 no way another block ends up jumping to the labels _B1 or _B2 since they are
2061 essentially invisible to other blocks. View them as control flow labels local
2062 to the basic block if you'd like.
2063
2064 Not doing this ultimately caused (part 2 of) #17334.
2065 -}
2066
2067
2068 -- -----------------------------------------------------------------------------
2069 -- Generating C calls
2070
2071 -- Now the biggest nightmare---calls. Most of the nastiness is buried in
2072 -- @get_arg@, which moves the arguments to the correct registers/stack
2073 -- locations. Apart from that, the code is easy.
2074 --
2075 -- (If applicable) Do not fill the delay slots here; you will confuse the
2076 -- register allocator.
2077 --
2078 -- See Note [Keeping track of the current block] for information why we need
2079 -- to take/return a block id.
2080
2081 genCCall
2082 :: Bool -- 32 bit platform?
2083 -> ForeignTarget -- function to call
2084 -> [CmmFormal] -- where to put the result
2085 -> [CmmActual] -- arguments (of mixed type)
2086 -> BlockId -- The block we are in
2087 -> NatM (InstrBlock, Maybe BlockId)
2088
2089 -- First we deal with cases which might introduce new blocks in the stream.
2090
2091 genCCall is32Bit (PrimTarget (MO_AtomicRMW width amop))
2092 [dst] [addr, n] bid = do
2093 Amode amode addr_code <-
2094 if amop `elem` [AMO_Add, AMO_Sub]
2095 then getAmode addr
2096 else getSimpleAmode is32Bit addr -- See genCCall for MO_Cmpxchg
2097 arg <- getNewRegNat format
2098 arg_code <- getAnyReg n
2099 platform <- ncgPlatform <$> getConfig
2100
2101 let dst_r = getRegisterReg platform (CmmLocal dst)
2102 (code, lbl) <- op_code dst_r arg amode
2103 return (addr_code `appOL` arg_code arg `appOL` code, Just lbl)
2104 where
2105 -- Code for the operation
2106 op_code :: Reg -- Destination reg
2107 -> Reg -- Register containing argument
2108 -> AddrMode -- Address of location to mutate
2109 -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId
2110 op_code dst_r arg amode = case amop of
2111 -- In the common case where dst_r is a virtual register the
2112 -- final move should go away, because it's the last use of arg
2113 -- and the first use of dst_r.
2114 AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
2115 , MOV format (OpReg arg) (OpReg dst_r)
2116 ], bid)
2117 AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg)
2118 , LOCK (XADD format (OpReg arg) (OpAddr amode))
2119 , MOV format (OpReg arg) (OpReg dst_r)
2120 ], bid)
2121 -- In these cases we need a new block id, and have to return it so
2122 -- that later instruction selection can reference it.
2123 AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
2124 AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
2125 , NOT format dst
2126 ])
2127 AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
2128 AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
2129 where
2130 -- Simulate operation that lacks a dedicated instruction using
2131 -- cmpxchg.
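        -- As a sketch, the generated loop behaves like this pseudo-code:
        --
        --       eax := *addr
        --   retry:
        --       dst := eax                 -- the old value, our result
        --       tmp := op(eax, arg)
        --       LOCK CMPXCHG tmp, *addr    -- reloads eax on failure
        --       if (ZF == 0) goto retry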
2132 cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
2133 -> NatM (OrdList Instr, BlockId)
2134 cmpxchg_code instrs = do
2135 lbl1 <- getBlockIdNat
2136 lbl2 <- getBlockIdNat
2137 tmp <- getNewRegNat format
2138
      -- Record inserted blocks:
2140 -- We turn A -> B into A -> A' -> A'' -> B
2141 -- with a self loop on A'.
2142 addImmediateSuccessorNat bid lbl1
2143 addImmediateSuccessorNat lbl1 lbl2
2144 updateCfgNat (addWeightEdge lbl1 lbl1 0)
2145
2146 return $ (toOL
2147 [ MOV format (OpAddr amode) (OpReg eax)
2148 , JXX ALWAYS lbl1
2149 , NEWBLOCK lbl1
2150 -- Keep old value so we can return it:
2151 , MOV format (OpReg eax) (OpReg dst_r)
2152 , MOV format (OpReg eax) (OpReg tmp)
2153 ]
2154 `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
2155 [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
2156 , JXX NE lbl1
2157 -- See Note [Introducing cfg edges inside basic blocks]
2158 -- why this basic block is required.
2159 , JXX ALWAYS lbl2
2160 , NEWBLOCK lbl2
2161 ],
2162 lbl2)
2163 format = intFormat width
2164
2165 genCCall is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
2166 | is32Bit, width == W64 = do
2167 ChildCode64 vcode rlo <- iselExpr64 src
2168 platform <- ncgPlatform <$> getConfig
2169 let rhi = getHiVRegFromLo rlo
2170 dst_r = getRegisterReg platform (CmmLocal dst)
2171 lbl1 <- getBlockIdNat
2172 lbl2 <- getBlockIdNat
2173 let format = if width == W8 then II16 else intFormat width
2174 tmp_r <- getNewRegNat format
2175
2176 -- New CFG Edges:
2177 -- bid -> lbl2
2178 -- bid -> lbl1 -> lbl2
    -- We also change edges originating at bid to start at lbl2 instead.
2180 weights <- getCfgWeights
2181 updateCfgNat (addWeightEdge bid lbl1 110 .
2182 addWeightEdge lbl1 lbl2 110 .
2183 addImmediateSuccessor weights bid lbl2)
2184
2185 -- The following instruction sequence corresponds to the pseudo-code
2186 --
2187 -- if (src) {
2188 -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
2189 -- } else {
2190 -- dst = 64;
2191 -- }
2192 let !instrs = vcode `appOL` toOL
2193 ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
2194 , OR II32 (OpReg rlo) (OpReg tmp_r)
2195 , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
2196 , JXX EQQ lbl2
2197 , JXX ALWAYS lbl1
2198
2199 , NEWBLOCK lbl1
2200 , BSF II32 (OpReg rhi) dst_r
2201 , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
2202 , BSF II32 (OpReg rlo) tmp_r
2203 , CMOV NE II32 (OpReg tmp_r) dst_r
2204 , JXX ALWAYS lbl2
2205
2206 , NEWBLOCK lbl2
2207 ])
2208 return (instrs, Just lbl2)
2209
2210 | otherwise = do
2211 code_src <- getAnyReg src
2212 config <- getConfig
2213 let platform = ncgPlatform config
2214 let dst_r = getRegisterReg platform (CmmLocal dst)
2215 if ncgBmiVersion config >= Just BMI2
2216 then do
2217 src_r <- getNewRegNat (intFormat width)
2218 let instrs = appOL (code_src src_r) $ case width of
2219 W8 -> toOL
2220 [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r)
2221 , TZCNT II32 (OpReg src_r) dst_r
2222 ]
2223 W16 -> toOL
2224 [ TZCNT II16 (OpReg src_r) dst_r
2225 , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
2226 ]
2227 _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
2228 return (instrs, Nothing)
2229 else do
2230 -- The following insn sequence makes sure 'ctz 0' has a defined value.
    -- Starting with Haswell, one could use the TZCNT insn instead.
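    -- As an illustrative sketch, for W32 this comes out roughly as
    --
    --     bsf    %src, %tmp     -- ZF=1 and tmp undefined if src == 0
    --     movl   $32, %dst
    --     cmovne %tmp, %dst     -- keep the 32 fallback only when src == 0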
2232 let format = if width == W8 then II16 else intFormat width
2233 src_r <- getNewRegNat format
2234 tmp_r <- getNewRegNat format
2235 let !instrs = code_src src_r `appOL` toOL
2236 ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
2237 [ BSF format (OpReg src_r) tmp_r
2238 , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
2239 , CMOV NE format (OpReg tmp_r) dst_r
2240 ]) -- NB: We don't need to zero-extend the result for the
2241 -- W8/W16 cases because the 'MOV' insn already
2242 -- took care of implicitly clearing the upper bits
2243 return (instrs, Nothing)
2244 where
2245 bw = widthInBits width
2246
2247 genCCall bits mop dst args bid = do
2248 config <- getConfig
2249 instr <- genCCall' config bits mop dst args bid
2250 return (instr, Nothing)
2251
2252 -- genCCall' handles cases not introducing new code blocks.
2253 genCCall'
2254 :: NCGConfig
2255 -> Bool -- 32 bit platform?
2256 -> ForeignTarget -- function to call
2257 -> [CmmFormal] -- where to put the result
2258 -> [CmmActual] -- arguments (of mixed type)
2259 -> BlockId -- The block we are in
2260 -> NatM InstrBlock
2261
2262 -- Unroll memcpy calls if the number of bytes to copy isn't too
2263 -- large. Otherwise, call C's memcpy.
2264 genCCall' config _ (PrimTarget (MO_Memcpy align)) _
2265 [dst, src, CmmLit (CmmInt n _)] _
2266 | fromInteger insns <= ncgInlineThresholdMemcpy config = do
2267 code_dst <- getAnyReg dst
2268 dst_r <- getNewRegNat format
2269 code_src <- getAnyReg src
2270 src_r <- getNewRegNat format
2271 tmp_r <- getNewRegNat format
2272 return $ code_dst dst_r `appOL` code_src src_r `appOL`
2273 go dst_r src_r tmp_r (fromInteger n)
2274 where
2275 platform = ncgPlatform config
2276 -- The number of instructions we will generate (approx). We need 2
2277 -- instructions per move.
2278 insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
2279
2280 maxAlignment = wordAlignment platform -- only machine word wide MOVs are supported
2281 effectiveAlignment = min (alignmentOf align) maxAlignment
2282 format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
2283
2284 -- The size of each move, in bytes.
2285 sizeBytes :: Integer
2286 sizeBytes = fromIntegral (formatInBytes format)
2287
2288 go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
2289 go dst src tmp i
2290 | i >= sizeBytes =
2291 unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
2292 unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
2293 go dst src tmp (i - sizeBytes)
2294 -- Deal with remaining bytes.
2295 | i >= 4 = -- Will never happen on 32-bit
2296 unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
2297 unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
2298 go dst src tmp (i - 4)
2299 | i >= 2 =
2300 unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
2301 unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
2302 go dst src tmp (i - 2)
2303 | i >= 1 =
2304 unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
2305 unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
2306 go dst src tmp (i - 1)
2307 | otherwise = nilOL
2308 where
2309 src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
2310 (ImmInteger (n - i))
2311 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
2312 (ImmInteger (n - i))
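
-- For example (an illustrative sketch): a 10-byte copy at 4-byte alignment
-- unrolls to two 4-byte MOV pairs for bytes 0-7 and one 2-byte pair for
-- bytes 8-9; each pair goes through tmp because x86 has no
-- memory-to-memory MOV.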
2313
2314 genCCall' config _ (PrimTarget (MO_Memset align)) _
2315 [dst,
2316 CmmLit (CmmInt c _),
2317 CmmLit (CmmInt n _)]
2318 _
2319 | fromInteger insns <= ncgInlineThresholdMemset config = do
2320 code_dst <- getAnyReg dst
2321 dst_r <- getNewRegNat format
2322 if format == II64 && n >= 8 then do
2323 code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
2324 imm8byte_r <- getNewRegNat II64
2325 return $ code_dst dst_r `appOL`
2326 code_imm8byte imm8byte_r `appOL`
2327 go8 dst_r imm8byte_r (fromInteger n)
2328 else
2329 return $ code_dst dst_r `appOL`
2330 go4 dst_r (fromInteger n)
2331 where
2332 platform = ncgPlatform config
2333 maxAlignment = wordAlignment platform -- only machine word wide MOVs are supported
2334 effectiveAlignment = min (alignmentOf align) maxAlignment
2335 format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
2336 c2 = c `shiftL` 8 .|. c
2337 c4 = c2 `shiftL` 16 .|. c2
2338 c8 = c4 `shiftL` 32 .|. c4
2339
    -- The number of instructions we will generate (approx). We need 1
    -- instruction per move.
2342 insns = (n + sizeBytes - 1) `div` sizeBytes
2343
2344 -- The size of each move, in bytes.
2345 sizeBytes :: Integer
2346 sizeBytes = fromIntegral (formatInBytes format)
2347
    -- Depending on the remaining size, returns the widest possible MOV
    -- instruction and its width.
2350 gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
2351 gen4 addr size
2352 | size >= 4 =
2353 (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
2354 | size >= 2 =
2355 (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
2356 | size >= 1 =
2357 (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
2358 | otherwise = (nilOL, 0)
2359
2360 -- Generates a 64-bit wide MOV instruction from REG to MEM.
2361 gen8 :: AddrMode -> Reg -> InstrBlock
2362 gen8 addr reg8byte =
2363 unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
2364
2365 -- Unrolls memset when the widest MOV is <= 4 bytes.
2366 go4 :: Reg -> Integer -> InstrBlock
2367 go4 dst left =
2368 if left <= 0 then nilOL
2369 else curMov `appOL` go4 dst (left - curWidth)
2370 where
2371 possibleWidth = minimum [left, sizeBytes]
2372 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
2373 (curMov, curWidth) = gen4 dst_addr possibleWidth
2374
2375 -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
2376 -- argument). Falls back to go4 when all 8 byte moves are
2377 -- exhausted.
2378 go8 :: Reg -> Reg -> Integer -> InstrBlock
2379 go8 dst reg8byte left =
2380 if possibleWidth >= 8 then
2381 let curMov = gen8 dst_addr reg8byte
2382 in curMov `appOL` go8 dst reg8byte (left - 8)
2383 else go4 dst left
2384 where
2385 possibleWidth = minimum [left, sizeBytes]
2386 dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
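
-- For example (an illustrative sketch): memset(dst, 0xAB, 16) at word
-- alignment on x86-64 first replicates the byte into a register
-- (c8 = 0xABABABABABABABAB) and emits two 8-byte stores via go8; a
-- trailing count of 3 would fall back to go4 (one 2-byte and one 1-byte
-- immediate store).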
2387
2388 genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL
2389 genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
-- barriers compile to no code on x86/x86-64;
-- we keep them in the instruction stream this long only to prevent
-- earlier optimisations from reordering across them.
2392
2393 genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
2394
2395 genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
2396 case n of
2397 0 -> genPrefetch src $ PREFETCH NTA format
2398 1 -> genPrefetch src $ PREFETCH Lvl2 format
2399 2 -> genPrefetch src $ PREFETCH Lvl1 format
2400 3 -> genPrefetch src $ PREFETCH Lvl0 format
2401 l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
    -- the C / LLVM prefetch convention is 0, 1, 2, and 3;
    -- the corresponding x86 names are NTA, 2, 1, and 0
2404 where
2405 format = archWordFormat is32bit
        -- we need to know the register width for pointers!
2407 genPrefetch inRegSrc prefetchCTor =
2408 do
2409 code_src <- getAnyReg inRegSrc
2410 src_r <- getNewRegNat format
2411 return $ code_src src_r `appOL`
2412 (unitOL (prefetchCTor (OpAddr
2413 ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
2414 -- prefetch always takes an address
2415
2416 genCCall' _ is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
2417 platform <- ncgPlatform <$> getConfig
2418 let dst_r = getRegisterReg platform (CmmLocal dst)
2419 case width of
2420 W64 | is32Bit -> do
2421 ChildCode64 vcode rlo <- iselExpr64 src
2422 let dst_rhi = getHiVRegFromLo dst_r
2423 rhi = getHiVRegFromLo rlo
2424 return $ vcode `appOL`
2425 toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
2426 MOV II32 (OpReg rhi) (OpReg dst_r),
2427 BSWAP II32 dst_rhi,
2428 BSWAP II32 dst_r ]
2429 W16 -> do code_src <- getAnyReg src
2430 return $ code_src dst_r `appOL`
2431 unitOL (BSWAP II32 dst_r) `appOL`
2432 unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
2433 _ -> do code_src <- getAnyReg src
2434 return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
2435 where
2436 format = intFormat width
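
-- For instance (a sketch): a W16 bswap of 0xAABB runs BSWAP on the full
-- 32-bit register (giving 0xBBAA0000) and then shifts right by 16
-- (giving 0x0000BBAA).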
2437
2438 genCCall' config is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
2439 args@[src] bid = do
2440 sse4_2 <- sse4_2Enabled
2441 let platform = ncgPlatform config
2442 if sse4_2
2443 then do code_src <- getAnyReg src
2444 src_r <- getNewRegNat format
2445 let dst_r = getRegisterReg platform (CmmLocal dst)
2446 return $ code_src src_r `appOL`
2447 (if width == W8 then
2448 -- The POPCNT instruction doesn't take a r/m8
2449 unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
2450 unitOL (POPCNT II16 (OpReg src_r) dst_r)
2451 else
2452 unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
2453 (if width == W8 || width == W16 then
2454 -- We used a 16-bit destination register above,
2455 -- so zero-extend
2456 unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
2457 else nilOL)
2458 else do
2459 targetExpr <- cmmMakeDynamicReference config
2460 CallReference lbl
2461 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2462 [NoHint] [NoHint]
2463 CmmMayReturn)
2464 genCCall' config is32Bit target dest_regs args bid
2465 where
2466 format = intFormat width
2467 lbl = mkCmmCodeLabel primUnitId (popCntLabel width)
2468
2469 genCCall' config is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
2470 args@[src, mask] bid = do
2471 let platform = ncgPlatform config
2472 if ncgBmiVersion config >= Just BMI2
2473 then do code_src <- getAnyReg src
2474 code_mask <- getAnyReg mask
2475 src_r <- getNewRegNat format
2476 mask_r <- getNewRegNat format
2477 let dst_r = getRegisterReg platform (CmmLocal dst)
2478 return $ code_src src_r `appOL` code_mask mask_r `appOL`
                 -- PDEP only supports 32- and 64-bit operands, so widen W8/W16 args
2480 ( if width == W8 || width == W16 then
2481 toOL
2482 [ MOVZxL format (OpReg src_r ) (OpReg src_r )
2483 , MOVZxL format (OpReg mask_r) (OpReg mask_r)
2484 , PDEP II32 (OpReg mask_r) (OpReg src_r ) dst_r
2485 , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
2486 ]
2487 else
2488 unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)
2489 )
2490 else do
2491 targetExpr <- cmmMakeDynamicReference config
2492 CallReference lbl
2493 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2494 [NoHint] [NoHint]
2495 CmmMayReturn)
2496 genCCall' config is32Bit target dest_regs args bid
2497 where
2498 format = intFormat width
2499 lbl = mkCmmCodeLabel primUnitId (pdepLabel width)
2500
2501 genCCall' config is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
2502 args@[src, mask] bid = do
2503 let platform = ncgPlatform config
2504 if ncgBmiVersion config >= Just BMI2
2505 then do code_src <- getAnyReg src
2506 code_mask <- getAnyReg mask
2507 src_r <- getNewRegNat format
2508 mask_r <- getNewRegNat format
2509 let dst_r = getRegisterReg platform (CmmLocal dst)
2510 return $ code_src src_r `appOL` code_mask mask_r `appOL`
2511 (if width == W8 || width == W16 then
2512 -- The PEXT instruction doesn't take a r/m8 or 16
2513 toOL
2514 [ MOVZxL format (OpReg src_r ) (OpReg src_r )
2515 , MOVZxL format (OpReg mask_r) (OpReg mask_r)
2516 , PEXT II32 (OpReg mask_r) (OpReg src_r ) dst_r
2517 , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
2518 ]
2519 else
2520 unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
2521 )
2522 else do
2523 targetExpr <- cmmMakeDynamicReference config
2524 CallReference lbl
2525 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2526 [NoHint] [NoHint]
2527 CmmMayReturn)
2528 genCCall' config is32Bit target dest_regs args bid
2529 where
2530 format = intFormat width
2531 lbl = mkCmmCodeLabel primUnitId (pextLabel width)
2532
2533 genCCall' config is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
2534 | is32Bit && width == W64 = do
    -- Fall back to `hs_clz64` on i386
2536 targetExpr <- cmmMakeDynamicReference config CallReference lbl
2537 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2538 [NoHint] [NoHint]
2539 CmmMayReturn)
2540 genCCall' config is32Bit target dest_regs args bid
2541
2542 | otherwise = do
2543 code_src <- getAnyReg src
2544 config <- getConfig
2545 let platform = ncgPlatform config
2546 let dst_r = getRegisterReg platform (CmmLocal dst)
2547 if ncgBmiVersion config >= Just BMI2
2548 then do
2549 src_r <- getNewRegNat (intFormat width)
2550 return $ appOL (code_src src_r) $ case width of
2551 W8 -> toOL
2552 [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit
2553 , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros
2554 , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
2555 ]
2556 W16 -> toOL
2557 [ LZCNT II16 (OpReg src_r) dst_r
2558 , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
2559 ]
2560 _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
2561 else do
2562 let format = if width == W8 then II16 else intFormat width
2563 src_r <- getNewRegNat format
2564 tmp_r <- getNewRegNat format
2565 return $ code_src src_r `appOL` toOL
2566 ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
2567 [ BSR format (OpReg src_r) tmp_r
2568 , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
2569 , CMOV NE format (OpReg tmp_r) dst_r
2570 , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
2571 ]) -- NB: We don't need to zero-extend the result for the
2572 -- W8/W16 cases because the 'MOV' insn already
2573 -- took care of implicitly clearing the upper bits
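        -- A sketch of why the final XOR computes clz: for src /= 0, BSR
        -- yields the index i of the highest set bit, and since 0 <= i < bw
        -- with bw a power of two, clz = bw-1-i = (bw-1) `xor` i. For
        -- src == 0 the CMOV leaves dst = 2*bw-1, and
        -- (2*bw-1) `xor` (bw-1) = bw, the defined result for clz 0.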
2574 where
2575 bw = widthInBits width
2576 lbl = mkCmmCodeLabel primUnitId (clzLabel width)
2577
2578 genCCall' config is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
2579 targetExpr <- cmmMakeDynamicReference config
2580 CallReference lbl
2581 let target = ForeignTarget targetExpr (ForeignConvention CCallConv
2582 [NoHint] [NoHint]
2583 CmmMayReturn)
2584 genCCall' config is32Bit target dest_regs args bid
2585 where
2586 lbl = mkCmmCodeLabel primUnitId (word2FloatLabel width)
2587
2588 genCCall' _ _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
2589 load_code <- intLoadCode (MOV (intFormat width)) addr
2590 platform <- ncgPlatform <$> getConfig
2591
2592 return (load_code (getRegisterReg platform (CmmLocal dst)))
2593
2594 genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
2595 code <- assignMem_IntCode (intFormat width) addr val
2596 return $ code `snocOL` MFENCE
2597
2598 genCCall' _ is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
2599 -- On x86 we don't have enough registers to use cmpxchg with a
2600 -- complicated addressing mode, so on that architecture we
2601 -- pre-compute the address first.
2602 | not (is32Bit && width == W64) = do
2603 Amode amode addr_code <- getSimpleAmode is32Bit addr
2604 newval <- getNewRegNat format
2605 newval_code <- getAnyReg new
2606 oldval <- getNewRegNat format
2607 oldval_code <- getAnyReg old
2608 platform <- getPlatform
2609 let dst_r = getRegisterReg platform (CmmLocal dst)
2610 code = toOL
2611 [ MOV format (OpReg oldval) (OpReg eax)
2612 , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
2613 , MOV format (OpReg eax) (OpReg dst_r)
2614 ]
2615 return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
2616 `appOL` code
2617 where
2618 format = intFormat width
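
-- A sketch of the semantics relied on above: %eax holds the expected old
-- value; LOCK CMPXCHG compares it with *addr and stores `new` on a match,
-- and in either case %eax ends up holding the value actually read from
-- *addr, which is what we hand back as the result.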
2619
2620 genCCall' config is32Bit (PrimTarget (MO_Xchg width)) [dst] [addr, value] _
  | (is32Bit && width == W64) = panic "genCCall: 64-bit atomic exchange not supported on 32-bit platforms"
2622 | otherwise = do
2623 let dst_r = getRegisterReg platform (CmmLocal dst)
2624 Amode amode addr_code <- getSimpleAmode is32Bit addr
2625 (newval, newval_code) <- getSomeReg value
2626 -- Copy the value into the target register, perform the exchange.
2627 let code = toOL
2628 [ MOV format (OpReg newval) (OpReg dst_r)
              -- On x86, xchg implies a lock prefix when used with a memory
              -- operand, so this is atomic.
2631 , XCHG format (OpAddr amode) dst_r
2632 ]
2633 return $ addr_code `appOL` newval_code `appOL` code
2634 where
2635 format = intFormat width
2636 platform = ncgPlatform config
2637
2638 genCCall' _ is32Bit target dest_regs args bid = do
2639 platform <- ncgPlatform <$> getConfig
2640 case (target, dest_regs) of
2641 -- void return type prim op
2642 (PrimTarget op, []) ->
2643 outOfLineCmmOp bid op Nothing args
2644 -- we only cope with a single result for foreign calls
2645 (PrimTarget op, [r]) -> case op of
2646 MO_F32_Fabs -> case args of
2647 [x] -> sse2FabsCode W32 x
2648 _ -> panic "genCCall: Wrong number of arguments for fabs"
2649 MO_F64_Fabs -> case args of
2650 [x] -> sse2FabsCode W64 x
2651 _ -> panic "genCCall: Wrong number of arguments for fabs"
2652
2653 MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
2654 MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
2655 _other_op -> outOfLineCmmOp bid op (Just r) args
2656
2657 where
2658 actuallyInlineSSE2Op = actuallyInlineFloatOp'
2659
2660 actuallyInlineFloatOp' instr format [x]
2661 = do res <- trivialUFCode format (instr format) x
2662 any <- anyReg res
2663 return (any (getRegisterReg platform (CmmLocal r)))
2664
2665 actuallyInlineFloatOp' _ _ args
2666 = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
2667 ++ show (length args) ++ ")"
2668
2669 sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
2670 sse2FabsCode w x = do
2671 let fmt = floatFormat w
2672 x_code <- getAnyReg x
2673 let
2674 const | FF32 <- fmt = CmmInt 0x7fffffff W32
2675 | otherwise = CmmInt 0x7fffffffffffffff W64
2676 Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
2677 tmp <- getNewRegNat fmt
2678 let
2679 code dst = x_code dst `appOL` amode_code `appOL` toOL [
2680 MOV fmt (OpAddr amode) (OpReg tmp),
2681 AND fmt (OpReg tmp) (OpReg dst)
2682 ]
2683
2684 return $ code (getRegisterReg platform (CmmLocal r))
2685
2686 (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
2687 (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
2688 (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
2689 (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
2690 case args of
2691 [arg_x, arg_y] ->
2692 do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
2693 let format = intFormat width
2694 lCode <- anyReg =<< trivialCode width (ADD_CC format)
2695 (Just (ADD_CC format)) arg_x arg_y
2696 let reg_l = getRegisterReg platform (CmmLocal res_l)
2697 reg_h = getRegisterReg platform (CmmLocal res_h)
2698 code = hCode reg_h `appOL`
2699 lCode reg_l `snocOL`
2700 ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
2701 return code
2702 _ -> panic "genCCall: Wrong number of arguments/results for add2"
2703 (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
2704 addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
2705 (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
2706 addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
2707 (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
2708 addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
2709 (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
2710 addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
2711 (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
2712 case args of
2713 [arg_x, arg_y] ->
2714 do (y_reg, y_code) <- getRegOrMem arg_y
2715 x_code <- getAnyReg arg_x
2716 let format = intFormat width
2717 reg_h = getRegisterReg platform (CmmLocal res_h)
2718 reg_l = getRegisterReg platform (CmmLocal res_l)
2719 code = y_code `appOL`
2720 x_code rax `appOL`
2721 toOL [MUL2 format y_reg,
2722 MOV format (OpReg rdx) (OpReg reg_h),
2723 MOV format (OpReg rax) (OpReg reg_l)]
2724 return code
2725 _ -> panic "genCCall: Wrong number of arguments/results for mul2"
2726 (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) ->
2727 case args of
2728 [arg_x, arg_y] ->
2729 do (y_reg, y_code) <- getRegOrMem arg_y
2730 x_code <- getAnyReg arg_x
2731 reg_tmp <- getNewRegNat II8
2732 let format = intFormat width
2733 reg_h = getRegisterReg platform (CmmLocal res_h)
2734 reg_l = getRegisterReg platform (CmmLocal res_l)
2735 reg_c = getRegisterReg platform (CmmLocal res_c)
2736 code = y_code `appOL`
2737 x_code rax `appOL`
2738 toOL [ IMUL2 format y_reg
2739 , MOV format (OpReg rdx) (OpReg reg_h)
2740 , MOV format (OpReg rax) (OpReg reg_l)
2741 , SETCC CARRY (OpReg reg_tmp)
2742 , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
2743 ]
2744 return code
2745 _ -> panic "genCCall: Wrong number of arguments/results for imul2"
2746
2747 _ -> do
2748 (instrs0, args') <- evalArgs bid args
2749 instrs1 <- if is32Bit
2750 then genCCall32' target dest_regs args'
2751 else genCCall64' target dest_regs args'
2752 return (instrs0 `appOL` instrs1)
2753
2754 where divOp1 platform signed width results [arg_x, arg_y]
2755 = divOp platform signed width results Nothing arg_x arg_y
2756 divOp1 _ _ _ _ _
2757 = panic "genCCall: Wrong number of arguments for divOp1"
2758 divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
2759 = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
2760 divOp2 _ _ _ _ _
2761 = panic "genCCall: Wrong number of arguments for divOp2"
2762
2763 -- See Note [DIV/IDIV for bytes]
2764 divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
2765 let widen | signed = MO_SS_Conv W8 W16
2766 | otherwise = MO_UU_Conv W8 W16
2767 arg_x_low_16 = CmmMachOp widen [arg_x_low]
2768 arg_y_16 = CmmMachOp widen [arg_y]
2769 m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
2770 in divOp
2771 platform signed W16 [res_q, res_r]
2772 m_arg_x_high_16 arg_x_low_16 arg_y_16
2773
2774 divOp platform signed width [res_q, res_r]
2775 m_arg_x_high arg_x_low arg_y
2776 = do let format = intFormat width
2777 reg_q = getRegisterReg platform (CmmLocal res_q)
2778 reg_r = getRegisterReg platform (CmmLocal res_r)
2779 widen | signed = CLTD format
2780 | otherwise = XOR format (OpReg rdx) (OpReg rdx)
2781 instr | signed = IDIV
2782 | otherwise = DIV
2783 (y_reg, y_code) <- getRegOrMem arg_y
2784 x_low_code <- getAnyReg arg_x_low
2785 x_high_code <- case m_arg_x_high of
2786 Just arg_x_high ->
2787 getAnyReg arg_x_high
2788 Nothing ->
2789 return $ const $ unitOL widen
2790 return $ y_code `appOL`
2791 x_low_code rax `appOL`
2792 x_high_code rdx `appOL`
2793 toOL [instr format y_reg,
2794 MOV format (OpReg rax) (OpReg reg_q),
2795 MOV format (OpReg rdx) (OpReg reg_r)]
2796 divOp _ _ _ _ _ _ _
2797 = panic "genCCall: Wrong number of results for divOp"
2798
2799 addSubIntC platform instr mrevinstr cond width
2800 res_r res_c [arg_x, arg_y]
2801 = do let format = intFormat width
2802 rCode <- anyReg =<< trivialCode width (instr format)
2803 (mrevinstr format) arg_x arg_y
2804 reg_tmp <- getNewRegNat II8
2805 let reg_c = getRegisterReg platform (CmmLocal res_c)
2806 reg_r = getRegisterReg platform (CmmLocal res_r)
2807 code = rCode reg_r `snocOL`
2808 SETCC cond (OpReg reg_tmp) `snocOL`
2809 MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
2810
2811 return code
2812 addSubIntC _ _ _ _ _ _ _ _
2813 = panic "genCCall: Wrong number of arguments/results for addSubIntC"
2814
2815 {-
2816 Note [Evaluate C-call arguments before placing in destination registers]
2817 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2818
2819 When producing code for C calls we must take care when placing arguments
2820 in their final registers. Specifically, we must ensure that temporary register
2821 usage due to evaluation of one argument does not clobber a register in which we
2822 already placed a previous argument (e.g. as the code generation logic for
2823 MO_Shl can clobber %rcx due to x86 instruction limitations).
2824
2825 This is precisely what happened in #18527. Consider this C--:
2826
2827 (result::I64) = call "ccall" doSomething(_s2hp::I64, 2244, _s2hq::I64, _s2hw::I64 | (1 << _s2hz::I64));
2828
2829 Here we are calling the C function `doSomething` with three arguments, the last
2830 involving a non-trivial expression involving MO_Shl. In this case the NCG could
2831 naively generate the following assembly (where $tmp denotes some temporary
2832 register and $argN denotes the register for argument N, as dictated by the
2833 platform's calling convention):
2834
2835 mov _s2hp, $arg1 # place first argument
2836 mov _s2hq, $arg2 # place second argument
2837
2838 # Compute 1 << _s2hz
2839 mov _s2hz, %rcx
2840 shl %cl, $tmp
2841
2842 # Compute (_s2hw | (1 << _s2hz))
2843 mov _s2hw, $arg3
2844 or $tmp, $arg3
2845
2846 # Perform the call
2847 call func
2848
This code is outright broken on Windows, which assigns $arg1 to %rcx. This
means that the evaluation of the last argument clobbers the first argument.
2851
2852 To avoid this we use a rather awful hack: when producing code for a C call with
2853 at least one non-trivial argument, we first evaluate all of the arguments into
2854 local registers before moving them into their final calling-convention-defined
2855 homes. This is performed by 'evalArgs'. Here we define "non-trivial" to be an
2856 expression which might contain a MachOp since these are the only cases which
2857 might clobber registers. Furthermore, we use a conservative approximation of
2858 this condition (only looking at the top-level of CmmExprs) to avoid spending
2859 too much effort trying to decide whether we want to take the fast path.
2860
2861 Note that this hack *also* applies to calls to out-of-line PrimTargets (which
2862 are lowered via a C call) since outOfLineCmmOp produces the call via
2863 (stmtToInstrs (CmmUnsafeForeignCall ...)), which will ultimately end up
2864 back in genCCall{32,64}.
2865 -}
2866
2867 -- | See Note [Evaluate C-call arguments before placing in destination registers]
2868 evalArgs :: BlockId -> [CmmActual] -> NatM (InstrBlock, [CmmActual])
2869 evalArgs bid actuals
2870 | any mightContainMachOp actuals = do
2871 regs_blks <- mapM evalArg actuals
2872 return (concatOL $ map fst regs_blks, map snd regs_blks)
2873 | otherwise = return (nilOL, actuals)
2874 where
2875 mightContainMachOp (CmmReg _) = False
2876 mightContainMachOp (CmmRegOff _ _) = False
2877 mightContainMachOp (CmmLit _) = False
2878 mightContainMachOp _ = True
2879
2880 evalArg :: CmmActual -> NatM (InstrBlock, CmmExpr)
2881 evalArg actual = do
2882 platform <- getPlatform
2883 lreg <- newLocalReg $ cmmExprType platform actual
2884 (instrs, bid1) <- stmtToInstrs bid $ CmmAssign (CmmLocal lreg) actual
2885 -- The above assignment shouldn't change the current block
2886 massert (isNothing bid1)
2887 return (instrs, CmmReg $ CmmLocal lreg)
2888
2889 newLocalReg :: CmmType -> NatM LocalReg
2890 newLocalReg ty = LocalReg <$> getUniqueM <*> pure ty
2891
2892 -- Note [DIV/IDIV for bytes]
2893 --
2894 -- IDIV reminder:
2895 -- Size Dividend Divisor Quotient Remainder
2896 -- byte %ax r/m8 %al %ah
2897 -- word %dx:%ax r/m16 %ax %dx
2898 -- dword %edx:%eax r/m32 %eax %edx
2899 -- qword %rdx:%rax r/m64 %rax %rdx
2900 --
2901 -- We do a special case for the byte division because the current
-- codegen doesn't deal well with accessing the %ah register (also,
2903 -- accessing %ah in 64-bit mode is complicated because it cannot be an
2904 -- operand of many instructions). So we just widen operands to 16 bits
2905 -- and get the results from %al, %dl. This is not optimal, but a few
2906 -- register moves are probably not a huge deal when doing division.
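--
-- As an illustrative sketch, an unsigned W8 quot/rem widens both operands
-- to 16 bits and becomes
--
--     xor %dx, %dx          -- clear the high half of the dividend
--     div r/m16             -- quotient -> %ax, remainder -> %dx
--
-- after which the W8 results are just the low bytes %al and %dl.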
2907
2908 genCCall32' :: ForeignTarget -- function to call
2909 -> [CmmFormal] -- where to put the result
2910 -> [CmmActual] -- arguments (of mixed type)
2911 -> NatM InstrBlock
2912 genCCall32' target dest_regs args = do
2913 config <- getConfig
2914 let platform = ncgPlatform config
2915 prom_args = map (maybePromoteCArg platform W32) args
2916
2917 -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
2918 arg_size_bytes :: CmmType -> Int
2919 arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth platform))
2920
2921 roundTo a x | x `mod` a == 0 = x
2922 | otherwise = x + a - (x `mod` a)
2923
2924 push_arg :: CmmActual {-current argument-}
2925 -> NatM InstrBlock -- code
2926
2927 push_arg arg -- we don't need the hints on x86
2928 | isWord64 arg_ty = do
2929 ChildCode64 code r_lo <- iselExpr64 arg
2930 delta <- getDeltaNat
2931 setDeltaNat (delta - 8)
2932 let r_hi = getHiVRegFromLo r_lo
2933 return ( code `appOL`
2934 toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
2935 PUSH II32 (OpReg r_lo), DELTA (delta - 8),
2936 DELTA (delta-8)]
2937 )
2938
2939 | isFloatType arg_ty = do
2940 (reg, code) <- getSomeReg arg
2941 delta <- getDeltaNat
2942 setDeltaNat (delta-size)
2943 return (code `appOL`
2944 toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
2945 DELTA (delta-size),
2946 let addr = AddrBaseIndex (EABaseReg esp)
2947 EAIndexNone
2948 (ImmInt 0)
2949 format = floatFormat (typeWidth arg_ty)
2950 in
2951
2952 -- assume SSE2
2953 MOV format (OpReg reg) (OpAddr addr)
2954
2955 ]
2956 )
2957
2958 | otherwise = do
2959 -- Arguments can be smaller than 32-bit, but we still use @PUSH
2960 -- II32@ - the usual calling conventions expect integers to be
2961 -- 4-byte aligned.
2962 massert ((typeWidth arg_ty) <= W32)
2963 (operand, code) <- getOperand arg
2964 delta <- getDeltaNat
2965 setDeltaNat (delta-size)
2966 return (code `snocOL`
2967 PUSH II32 operand `snocOL`
2968 DELTA (delta-size))
2969
2970 where
2971 arg_ty = cmmExprType platform arg
2972 size = arg_size_bytes arg_ty -- Byte size
2973
2974 let
2975 -- Align stack to 16n for calls, assuming a starting stack
        -- alignment of 16n - word_size on procedure entry, which we
        -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
2978 sizes = map (arg_size_bytes . cmmExprType platform) (reverse args)
2979 raw_arg_size = sum sizes + platformWordSizeInBytes platform
2980 arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
2981 tot_arg_size = raw_arg_size + arg_pad_size - platformWordSizeInBytes platform
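        -- For example (a worked sketch): three 4-byte args give
        -- raw_arg_size = 12 + 4 = 16, hence no padding and
        -- tot_arg_size = 12; four args give raw_arg_size = 20, which
        -- rounds up to 32, so arg_pad_size = 12 and tot_arg_size = 28.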
2982
2983
2984 delta0 <- getDeltaNat
2985 setDeltaNat (delta0 - arg_pad_size)
2986
2987 push_codes <- mapM push_arg (reverse prom_args)
2988 delta <- getDeltaNat
2989 massert (delta == delta0 - tot_arg_size)
2990
2991 -- deal with static vs dynamic call targets
2992 (callinsns,cconv) <-
2993 case target of
2994 ForeignTarget (CmmLit (CmmLabel lbl)) conv
2995 -> -- ToDo: stdcall arg sizes
2996 return (unitOL (CALL (Left fn_imm) []), conv)
2997 where fn_imm = ImmCLbl lbl
2998 ForeignTarget expr conv
2999 -> do { (dyn_r, dyn_c) <- getSomeReg expr
3000 ; massert (isWord32 (cmmExprType platform expr))
3001 ; return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
3002 PrimTarget _
3003 -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
3004 ++ "probably because too many return values."
3005
3006 let push_code
3007 | arg_pad_size /= 0
3008 = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
3009 DELTA (delta0 - arg_pad_size)]
3010 `appOL` concatOL push_codes
3011 | otherwise
3012 = concatOL push_codes
3013
3014 -- Deallocate parameters after call for ccall;
3015 -- but not for stdcall (callee does it)
3016 --
3017 -- We have to pop any stack padding we added
3018 -- even if we are doing stdcall, though (#5052)
3019 pop_size
3020 | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
3021 | otherwise = tot_arg_size
3022
3023 call = callinsns `appOL`
3024 toOL (
3025 (if pop_size==0 then [] else
3026 [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
3027 ++
3028 [DELTA delta0]
3029 )
3030 setDeltaNat delta0
3031
3032 let
3033 -- assign the results, if necessary
3034 assign_code [] = nilOL
3035 assign_code [dest]
3036 | isFloatType ty =
3037 -- we assume SSE2
3038 let tmp_amode = AddrBaseIndex (EABaseReg esp)
3039 EAIndexNone
3040 (ImmInt 0)
3041 fmt = floatFormat w
3042 in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
3043 DELTA (delta0 - b),
3044 X87Store fmt tmp_amode,
3045 -- X87Store only supported for the CDECL ABI
3046 -- NB: This code will need to be
3047 -- revisited once GHC does more work around
                 -- SIGFPE
3049 MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
3050 ADD II32 (OpImm (ImmInt b)) (OpReg esp),
3051 DELTA delta0]
3052 | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
3053 MOV II32 (OpReg edx) (OpReg r_dest_hi)]
3054 | otherwise = unitOL (MOV (intFormat w)
3055 (OpReg eax)
3056 (OpReg r_dest))
3057 where
3058 ty = localRegType dest
3059 w = typeWidth ty
3060 b = widthInBytes w
3061 r_dest_hi = getHiVRegFromLo r_dest
3062 r_dest = getRegisterReg platform (CmmLocal dest)
3063 assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
3064
3065 return (push_code `appOL`
3066 call `appOL`
3067 assign_code dest_regs)
3068
3069 genCCall64' :: ForeignTarget -- function to call
3070 -> [CmmFormal] -- where to put the result
3071 -> [CmmActual] -- arguments (of mixed type)
3072 -> NatM InstrBlock
3073 genCCall64' target dest_regs args = do
3074 platform <- getPlatform
3075 -- load up the register arguments
3076 let prom_args = map (maybePromoteCArg platform W32) args
3077
3078 let load_args :: [CmmExpr]
3079 -> [Reg] -- int regs avail for args
3080 -> [Reg] -- FP regs avail for args
3081 -> InstrBlock -- code computing args
3082 -> InstrBlock -- code assigning args to ABI regs
3083 -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
3084 -- no more regs to use
3085 load_args args [] [] code acode =
3086 return (args, [], [], code, acode)
3087
3088 -- no more args to push
3089 load_args [] aregs fregs code acode =
3090 return ([], aregs, fregs, code, acode)
3091
3092 load_args (arg : rest) aregs fregs code acode
3093 | isFloatType arg_rep = case fregs of
3094 [] -> push_this_arg
3095 (r:rs) -> do
3096 (code',acode') <- reg_this_arg r
3097 load_args rest aregs rs code' acode'
3098 | otherwise = case aregs of
3099 [] -> push_this_arg
3100 (r:rs) -> do
3101 (code',acode') <- reg_this_arg r
3102 load_args rest rs fregs code' acode'
3103 where
3104
3105 -- put arg into the list of stack pushed args
3106 push_this_arg = do
3107 (args',ars,frs,code',acode')
3108 <- load_args rest aregs fregs code acode
3109 return (arg:args', ars, frs, code', acode')
3110
3111 -- pass the arg into the given register
3112 reg_this_arg r
3113 -- "operand" args can be directly assigned into r
3114 | isOperand False arg = do
3115 arg_code <- getAnyReg arg
3116 return (code, (acode `appOL` arg_code r))
3117 -- The last non-operand arg can be directly assigned after its
3118 -- computation without going into a temporary register
3119 | all (isOperand False) rest = do
3120 arg_code <- getAnyReg arg
3121 return (code `appOL` arg_code r,acode)
3122
3123 -- other args need to be computed beforehand to avoid clobbering
3124 -- previously assigned registers used to pass parameters (see
3125 -- #11792, #12614). They are assigned into temporary registers
3126 -- and get assigned to proper call ABI registers after they all
3127 -- have been computed.
3128 | otherwise = do
3129 arg_code <- getAnyReg arg
3130 tmp <- getNewRegNat arg_fmt
3131 let
3132 code' = code `appOL` arg_code tmp
3133 acode' = acode `snocOL` reg2reg arg_fmt tmp r
3134 return (code',acode')
3135
3136 arg_rep = cmmExprType platform arg
3137 arg_fmt = cmmTypeFormat arg_rep

        load_args_win :: [CmmExpr]
                      -> [Reg]        -- used int regs
                      -> [Reg]        -- used FP regs
                      -> [(Reg, Reg)] -- (int, FP) regs avail for args
                      -> InstrBlock
                      -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
        load_args_win args usedInt usedFP [] code
            = return (args, usedInt, usedFP, code, nilOL)
            -- no more regs to use
        load_args_win [] usedInt usedFP _ code
            = return ([], usedInt, usedFP, code, nilOL)
            -- no more args to push
        load_args_win (arg : rest) usedInt usedFP
                      ((ireg, freg) : regs) code
            | isFloatType arg_rep = do
                 arg_code <- getAnyReg arg
                 load_args_win rest (ireg : usedInt) (freg : usedFP) regs
                               (code `appOL`
                                arg_code freg `snocOL`
                                -- If we are calling a varargs function
                                -- then we need to define ireg as well
                                -- as freg
                                MOV II64 (OpReg freg) (OpReg ireg))
            | otherwise = do
                 arg_code <- getAnyReg arg
                 load_args_win rest (ireg : usedInt) usedFP regs
                               (code `appOL` arg_code ireg)
            where
              arg_rep = cmmExprType platform arg

        arg_size = 8 -- always, at the moment

        push_args [] code = return code
        push_args (arg:rest) code
            | isFloatType arg_rep = do
              (arg_reg, arg_code) <- getSomeReg arg
              delta <- getDeltaNat
              setDeltaNat (delta-arg_size)
              let code' = code `appOL` arg_code `appOL` toOL [
                             SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
                             DELTA (delta-arg_size),
                             MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel platform 0))]
              push_args rest code'

            | otherwise = do
              -- Arguments can be smaller than 64-bit, but we still use @PUSH
              -- II64@ - the usual calling conventions expect integers to be
              -- 8-byte aligned.
              massert (width <= W64)
              (arg_op, arg_code) <- getOperand arg
              delta <- getDeltaNat
              setDeltaNat (delta-arg_size)
              let code' = code `appOL` arg_code `appOL` toOL [
                             PUSH II64 arg_op,
                             DELTA (delta-arg_size)]
              push_args rest code'
            where
              arg_rep = cmmExprType platform arg
              width   = typeWidth arg_rep

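        -- On Win64 the caller must reserve stack "shadow space" for the
        -- four register arguments whether or not they are used; that is
        -- what leaveStackSpace below provides (n slots of arg_size bytes).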
        leaveStackSpace n = do
             delta <- getDeltaNat
             setDeltaNat (delta - n * arg_size)
             return $ toOL [
                         SUB II64 (OpImm (ImmInt (n * platformWordSizeInBytes platform))) (OpReg rsp),
                         DELTA (delta - n * arg_size)]

    (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
         <-
        if platformOS platform == OSMinGW32
        then load_args_win prom_args [] [] (allArgRegs platform) nilOL
        else do
           (stack_args, aregs, fregs, load_args_code, assign_args_code)
               <- load_args prom_args (allIntArgRegs platform)
                                      (allFPArgRegs platform)
                                      nilOL nilOL
           -- the registers actually used are the prefix of the available
           -- list that load_args did not hand back unused
           let used_regs rs as = reverse (drop (length rs) (reverse as))
               fregs_used      = used_regs fregs (allFPArgRegs platform)
               aregs_used      = used_regs aregs (allIntArgRegs platform)
           return (stack_args, aregs_used, fregs_used, load_args_code
                  , assign_args_code)

    let
        arg_regs_used = int_regs_used ++ fp_regs_used
        arg_regs = [eax] ++ arg_regs_used
                -- for annotating the call instruction with the
                -- registers it reads
        sse_regs = length fp_regs_used
        arg_stack_slots = if platformOS platform == OSMinGW32
                          then length stack_args + length (allArgRegs platform)
                          else length stack_args
        tot_arg_size = arg_size * arg_stack_slots


    -- Align stack to 16n for calls, assuming a starting stack
    -- alignment of 16n - word_size on procedure entry, which we
    -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
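    -- For example, with word_size = 8 and tot_arg_size = 16 we have
    -- (16 + 8) `rem` 16 /= 0, so we subtract one extra word from rsp;
    -- real_size becomes 24 and rsp is 16-byte aligned at the CALL.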
    let word_size = platformWordSizeInBytes platform
    (real_size, adjust_rsp) <-
        if (tot_arg_size + word_size) `rem` 16 == 0
            then return (tot_arg_size, nilOL)
            else do -- we need to adjust...
                delta <- getDeltaNat
                setDeltaNat (delta - word_size)
                return (tot_arg_size + word_size, toOL [
                            SUB II64 (OpImm (ImmInt word_size)) (OpReg rsp),
                            DELTA (delta - word_size) ])

    -- push the stack args, right to left
    push_code <- push_args (reverse stack_args) nilOL
    -- On Win64, we also have to leave stack space for the arguments
    -- that we are passing in registers
    lss_code <- if platformOS platform == OSMinGW32
                then leaveStackSpace (length (allArgRegs platform))
                else return nilOL
    delta <- getDeltaNat

    -- deal with static vs dynamic call targets
    (callinsns,_cconv) <-
      case target of
        ForeignTarget (CmmLit (CmmLabel lbl)) conv
           -> -- ToDo: stdcall arg sizes
              return (unitOL (CALL (Left fn_imm) arg_regs), conv)
           where fn_imm = ImmCLbl lbl
        ForeignTarget expr conv
           -> do (dyn_r, dyn_c) <- getSomeReg expr
                 return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
        PrimTarget _
            -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
                        ++ "probably because too many return values."

    let
        -- The x86_64 ABI requires us to set %al to the number of SSE2
        -- registers that contain arguments, if the called routine
        -- is a varargs function. We don't know whether it's a
        -- varargs function or not, so we have to assume it is.
        --
        -- It's not safe to omit this assignment, even if the number
        -- of SSE2 regs in use is zero. If %al is larger than 8
        -- on entry to a varargs function, seg faults ensue.
        assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
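        -- E.g. with two floating-point args passed in SSE registers,
        -- @assign_eax 2@ emits: movl $2,%eax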

    let call = callinsns `appOL`
               toOL (
                  -- Deallocate parameters after call for ccall;
                  -- stdcall has callee do it, but is not supported on
                  -- x86_64 target (see #3336)
                  (if real_size==0 then [] else
                   [ADD (intFormat (platformWordWidth platform)) (OpImm (ImmInt real_size)) (OpReg esp)])
                  ++
                  [DELTA (delta + real_size)]
               )
    setDeltaNat (delta + real_size)

    let
        -- assign the results, if necessary
        assign_code []     = nilOL
        assign_code [dest] =
          case typeWidth rep of
                W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
                                                     (OpReg xmm0)
                                                     (OpReg r_dest))
                W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
                                                     (OpReg xmm0)
                                                     (OpReg r_dest))
                _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
          where
                rep    = localRegType dest
                r_dest = getRegisterReg platform (CmmLocal dest)
        assign_code _many = panic "genCCall.assign_code many"

    return (adjust_rsp          `appOL`
            push_code           `appOL`
            load_args_code      `appOL`
            assign_args_code    `appOL`
            lss_code            `appOL`
            assign_eax sse_regs `appOL`
            call                `appOL`
            assign_code dest_regs)


maybePromoteCArg :: Platform -> Width -> CmmExpr -> CmmExpr
maybePromoteCArg platform wto arg
 | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
 | otherwise   = arg
 where
   wfrom = cmmExprWidth platform arg
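-- A minimal illustration (hypothetical argument, not taken from real code):
-- an 8-bit actual @CmmLit (CmmInt 1 W8)@ is widened to
-- @CmmMachOp (MO_UU_Conv W8 W32) [CmmLit (CmmInt 1 W8)]@ before the call,
-- while arguments of width W32 or above are passed through unchanged.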

outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
               -> NatM InstrBlock
outOfLineCmmOp bid mop res args
  = do
      config <- getConfig
      targetExpr <- cmmMakeDynamicReference config CallReference lbl
      let target = ForeignTarget targetExpr
                       (ForeignConvention CCallConv [] [] CmmMayReturn)

      -- We know foreign calls result in no new basic blocks, so we can ignore
      -- the returned block id.
      (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
      return instrs
  where
        -- Assume we can call these functions directly, and that they're not in a dynamic library.
        -- TODO: Why is this ok? Under Linux this code will be in libm.so
        --       Is it because they're really implemented as a primitive instruction by the assembler?? -- BL 2009/12/31
        lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction

        fn = case mop of
              MO_F32_Sqrt  -> fsLit "sqrtf"
              MO_F32_Fabs  -> fsLit "fabsf"
              MO_F32_Sin   -> fsLit "sinf"
              MO_F32_Cos   -> fsLit "cosf"
              MO_F32_Tan   -> fsLit "tanf"
              MO_F32_Exp   -> fsLit "expf"
              MO_F32_ExpM1 -> fsLit "expm1f"
              MO_F32_Log   -> fsLit "logf"
              MO_F32_Log1P -> fsLit "log1pf"

              MO_F32_Asin  -> fsLit "asinf"
              MO_F32_Acos  -> fsLit "acosf"
              MO_F32_Atan  -> fsLit "atanf"

              MO_F32_Sinh  -> fsLit "sinhf"
              MO_F32_Cosh  -> fsLit "coshf"
              MO_F32_Tanh  -> fsLit "tanhf"
              MO_F32_Pwr   -> fsLit "powf"

              MO_F32_Asinh -> fsLit "asinhf"
              MO_F32_Acosh -> fsLit "acoshf"
              MO_F32_Atanh -> fsLit "atanhf"

              MO_F64_Sqrt  -> fsLit "sqrt"
              MO_F64_Fabs  -> fsLit "fabs"
              MO_F64_Sin   -> fsLit "sin"
              MO_F64_Cos   -> fsLit "cos"
              MO_F64_Tan   -> fsLit "tan"
              MO_F64_Exp   -> fsLit "exp"
              MO_F64_ExpM1 -> fsLit "expm1"
              MO_F64_Log   -> fsLit "log"
              MO_F64_Log1P -> fsLit "log1p"

              MO_F64_Asin  -> fsLit "asin"
              MO_F64_Acos  -> fsLit "acos"
              MO_F64_Atan  -> fsLit "atan"

              MO_F64_Sinh  -> fsLit "sinh"
              MO_F64_Cosh  -> fsLit "cosh"
              MO_F64_Tanh  -> fsLit "tanh"
              MO_F64_Pwr   -> fsLit "pow"

              MO_F64_Asinh -> fsLit "asinh"
              MO_F64_Acosh -> fsLit "acosh"
              MO_F64_Atanh -> fsLit "atanh"

              MO_I64_ToI   -> fsLit "hs_int64ToInt"
              MO_I64_FromI -> fsLit "hs_intToInt64"
              MO_W64_ToW   -> fsLit "hs_word64ToWord"
              MO_W64_FromW -> fsLit "hs_wordToWord64"
              MO_x64_Neg   -> fsLit "hs_neg64"
              MO_x64_Add   -> fsLit "hs_add64"
              MO_x64_Sub   -> fsLit "hs_sub64"
              MO_x64_Mul   -> fsLit "hs_mul64"
              MO_I64_Quot  -> fsLit "hs_quotInt64"
              MO_I64_Rem   -> fsLit "hs_remInt64"
              MO_W64_Quot  -> fsLit "hs_quotWord64"
              MO_W64_Rem   -> fsLit "hs_remWord64"
              MO_x64_And   -> fsLit "hs_and64"
              MO_x64_Or    -> fsLit "hs_or64"
              MO_x64_Xor   -> fsLit "hs_xor64"
              MO_x64_Not   -> fsLit "hs_not64"
              MO_x64_Shl   -> fsLit "hs_uncheckedShiftL64"
              MO_I64_Shr   -> fsLit "hs_uncheckedIShiftRA64"
              MO_W64_Shr   -> fsLit "hs_uncheckedShiftRL64"
              MO_x64_Eq    -> fsLit "hs_eq64"
              MO_x64_Ne    -> fsLit "hs_ne64"
              MO_I64_Ge    -> fsLit "hs_geInt64"
              MO_I64_Gt    -> fsLit "hs_gtInt64"
              MO_I64_Le    -> fsLit "hs_leInt64"
              MO_I64_Lt    -> fsLit "hs_ltInt64"
              MO_W64_Ge    -> fsLit "hs_geWord64"
              MO_W64_Gt    -> fsLit "hs_gtWord64"
              MO_W64_Le    -> fsLit "hs_leWord64"
              MO_W64_Lt    -> fsLit "hs_ltWord64"

              MO_Memcpy _  -> fsLit "memcpy"
              MO_Memset _  -> fsLit "memset"
              MO_Memmove _ -> fsLit "memmove"
              MO_Memcmp _  -> fsLit "memcmp"

              MO_SuspendThread -> fsLit "suspendThread"
              MO_ResumeThread  -> fsLit "resumeThread"

              MO_PopCnt _  -> fsLit "popcnt"
              MO_BSwap _   -> fsLit "bswap"
              {- Here the C implementation is used as there is no x86
                 instruction to reverse a word's bit order.
              -}
              MO_BRev w    -> bRevLabel w
              MO_Clz w     -> clzLabel w
              MO_Ctz _     -> unsupported

              MO_Pdep w    -> pdepLabel w
              MO_Pext w    -> pextLabel w

              MO_AtomicRMW _ _ -> unsupported
              MO_AtomicRead _  -> unsupported
              MO_AtomicWrite _ -> unsupported
              MO_Cmpxchg w     -> cmpxchgLabel w -- for W64 on 32-bit
                                                 -- TODO: implement the
                                                 -- cmpxchg8b instr
              MO_Xchg _        -> should_be_inline

              MO_UF_Conv _     -> unsupported

              MO_S_Mul2    {}  -> unsupported
              MO_S_QuotRem {}  -> unsupported
              MO_U_QuotRem {}  -> unsupported
              MO_U_QuotRem2 {} -> unsupported
              MO_Add2 {}       -> unsupported
              MO_AddIntC {}    -> unsupported
              MO_SubIntC {}    -> unsupported
              MO_AddWordC {}   -> unsupported
              MO_SubWordC {}   -> unsupported
              MO_U_Mul2 {}     -> unsupported
              MO_ReadBarrier   -> unsupported
              MO_WriteBarrier  -> unsupported
              MO_Touch         -> unsupported
              (MO_Prefetch_Data _ ) -> unsupported
        unsupported = panic ("outOfLineCmmOp: " ++ show mop
                          ++ " not supported here")
        -- If we generate a call for the given primop
        -- something went wrong.
        should_be_inline = panic ("outOfLineCmmOp: " ++ show mop
                          ++ " should be handled inline")


-- -----------------------------------------------------------------------------
-- Generating a table-branch

genSwitch :: CmmExpr -> SwitchTargets -> NatM InstrBlock

genSwitch expr targets = do
  config <- getConfig
  let platform = ncgPlatform config
      -- We widen to a native-width register because we cannot use arbitrary
      -- sizes in x86 addressing modes.
      exprWidened = CmmMachOp
                      (MO_UU_Conv (cmmExprWidth platform expr)
                                  (platformWordWidth platform))
                      [expr]
      indexExpr = cmmOffset platform exprWidened offset
  if ncgPIC config
  then do
        (reg,e_code) <- getNonClobberedReg indexExpr
           -- getNonClobberedReg because it needs to survive across t_code
        lbl <- getNewLabelNat
        let is32bit = target32Bit platform
            os = platformOS platform
            -- Might want to use .rodata.<function we're in> instead, but as
            -- long as it's something unique it'll work out since the
            -- references to the jump table are in the appropriate section.
            rosection = case os of
              -- on Mac OS X/x86_64, put the jump table in the text section to
              -- work around a limitation of the linker.
              -- ld64 is unable to handle the relocations for
              --     .quad L1 - L0
              -- if L0 is not preceded by a non-anonymous label in its section.
              OSDarwin | not is32bit -> Section Text lbl
              _ -> Section ReadOnlyData lbl
        dynRef <- cmmMakeDynamicReference config DataReference lbl
        (tableReg,t_code) <- getSomeReg $ dynRef
        let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
                                       (EAIndex reg (platformWordSizeInBytes platform)) (ImmInt 0))

        offsetReg <- getNewRegNat (intFormat (platformWordWidth platform))
        return $ if is32bit || os == OSDarwin
                 then e_code `appOL` t_code `appOL` toOL [
                          ADD (intFormat (platformWordWidth platform)) op (OpReg tableReg),
                          JMP_TBL (OpReg tableReg) ids rosection lbl
                      ]
                 else -- HACK: On x86_64 binutils<2.17 is only able to generate
                      -- PC32 relocations, hence we only get 32-bit offsets in
                      -- the jump table. As these offsets are always negative
                      -- we need to properly sign extend them to 64-bit. This
                      -- hack should be removed in conjunction with the hack in
                      -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
                      e_code `appOL` t_code `appOL` toOL [
                          MOVSxL II32 op (OpReg offsetReg),
                          ADD (intFormat (platformWordWidth platform))
                              (OpReg offsetReg)
                              (OpReg tableReg),
                          JMP_TBL (OpReg tableReg) ids rosection lbl
                      ]
  else do
        (reg,e_code) <- getSomeReg indexExpr
        lbl <- getNewLabelNat
        let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (platformWordSizeInBytes platform)) (ImmCLbl lbl))
            code = e_code `appOL` toOL [
                       JMP_TBL op ids (Section ReadOnlyData lbl) lbl
                   ]
        return code
  where
    (offset, blockIds) = switchTargetsToTable targets
    ids = map (fmap DestBlockId) blockIds

generateJumpTableForInstr :: NCGConfig -> Instr -> Maybe (NatCmmDecl (Alignment, RawCmmStatics) Instr)
generateJumpTableForInstr config (JMP_TBL _ ids section lbl)
    = let getBlockId (DestBlockId id) = id
          getBlockId _ = panic "Non-Label target in Jump Table"
          blockIds = map (fmap getBlockId) ids
      in Just (createJumpTable config blockIds section lbl)
generateJumpTableForInstr _ _ = Nothing

createJumpTable :: NCGConfig -> [Maybe BlockId] -> Section -> CLabel
                -> GenCmmDecl (Alignment, RawCmmStatics) h g
createJumpTable config ids section lbl
    = let jumpTable
            | ncgPIC config =
                  let ww = ncgWordWidth config
                      jumpTableEntryRel Nothing
                          = CmmStaticLit (CmmInt 0 ww)
                      jumpTableEntryRel (Just blockid)
                          = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
                          where blockLabel = blockLbl blockid
                  in map jumpTableEntryRel ids
            | otherwise = map (jumpTableEntry config) ids
      in CmmData section (mkAlignment 1, CmmStaticsRaw lbl jumpTable)
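-- For instance (schematic): in PIC mode each table entry is a label
-- difference such as @.quad _blk - _tbl@ (or 0 for a missing target), so
-- the table itself is position-independent; without PIC the entries are
-- absolute block addresses.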

extractUnwindPoints :: [Instr] -> [UnwindPoint]
extractUnwindPoints instrs =
    [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]

-- -----------------------------------------------------------------------------
-- 'condIntReg' and 'condFltReg': condition codes into registers

-- Turn those condition codes into integers now (when they appear on
-- the right hand side of an assignment).
--
-- (If applicable) Do not fill the delay slots here; you will confuse the
-- register allocator.

condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register

condIntReg cond x y = do
  CondCode _ cond cond_code <- condIntCode cond x y
  tmp <- getNewRegNat II8
  let
        code dst = cond_code `appOL` toOL [
                       SETCC cond (OpReg tmp),
                       MOVZxL II8 (OpReg tmp) (OpReg dst)
                   ]
  return (Any II32 code)


-----------------------------------------------------------
---             Note [SSE Parity Checks]                 ---
-----------------------------------------------------------

-- We have to worry about unordered operands (eg. comparisons
-- against NaN). If the operands are unordered, the comparison
-- sets the parity flag, carry flag and zero flag.
-- All comparisons are supposed to return false for unordered
-- operands except for !=, which returns true.
--
-- Optimisation: we don't have to test the parity flag if we
-- know the test has already excluded the unordered case: eg >
-- and >= test for a zero carry flag, which can only occur for
-- ordered operands.
--
-- By reversing comparisons we can avoid testing the parity
-- for < and <= as well. If any of the arguments is a NaN we
-- return false either way. If both arguments are valid then
-- x <= y  <->  y >= x  holds, so it's safe to swap these.
--
-- We invert the condition inside getRegister' and getCondCode,
-- which should cover all invertible cases.
-- All other functions translating FP comparisons to assembly
-- use these two to generate the comparison code.
--
-- As an example consider a simple check:
--
-- func :: Float -> Float -> Int
-- func x y = if x < y then 1 else 0
--
-- which in Cmm gives the floating point comparison
--
--  if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
--
-- We used to compile this to an assembly code block like this:
-- _c2gh:
--  ucomiss %xmm2,%xmm1
--  jp _c2gf
--  jb _c2gg
--  jmp _c2gf
--
-- where we had to introduce an explicit check for unordered
-- results (the jp, jump-on-parity, instruction).
--
-- We can avoid this by exchanging the arguments and inverting the direction
-- of the comparison. This results in the sequence:
--
--  ucomiss %xmm1,%xmm2
--  ja _c2g2
--  jmp _c2g1
--
-- Removing the jump reduces the pressure on the branch prediction system
-- and plays better with the uOP cache.

condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
condFltReg is32Bit cond x y = condFltReg_sse2
 where

  condFltReg_sse2 = do
      CondCode _ cond cond_code <- condFltCode cond x y
      tmp1 <- getNewRegNat (archWordFormat is32Bit)
      tmp2 <- getNewRegNat (archWordFormat is32Bit)
      let -- See Note [SSE Parity Checks]
          code dst =
             cond_code `appOL`
               (case cond of
                  NE  -> or_unordered dst
                  GU  -> plain_test dst
                  GEU -> plain_test dst
                  -- Use ASSERT so we don't break releases if these creep in.
                  LTT -> assertPpr False (ppr "Should have been turned into >") $
                         and_ordered dst
                  LE  -> assertPpr False (ppr "Should have been turned into >=") $
                         and_ordered dst
                  _   -> and_ordered dst)

          plain_test dst = toOL [
                      SETCC cond (OpReg tmp1),
                      MOVZxL II8 (OpReg tmp1) (OpReg dst)
                   ]
          or_unordered dst = toOL [
                      SETCC cond (OpReg tmp1),
                      SETCC PARITY (OpReg tmp2),
                      OR II8 (OpReg tmp1) (OpReg tmp2),
                      MOVZxL II8 (OpReg tmp2) (OpReg dst)
                   ]
          and_ordered dst = toOL [
                      SETCC cond (OpReg tmp1),
                      SETCC NOTPARITY (OpReg tmp2),
                      AND II8 (OpReg tmp1) (OpReg tmp2),
                      MOVZxL II8 (OpReg tmp2) (OpReg dst)
                   ]
      return (Any II32 code)


-- -----------------------------------------------------------------------------
-- 'trivial*Code': deal with trivial instructions

-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
-- unary: 'trivialUCode', unary fl-pt: 'trivialUFCode') instructions.
-- Only look for constants on the right hand side, because that's
-- where the generic optimizer will have put them.

-- Similarly, for unary instructions, we don't have to worry about
-- matching an StInt as the argument, because genericOpt will already
-- have handled the constant-folding.


{-
The Rules of the Game are:

* You cannot assume anything about the destination register dst;
  it may be anything, including a fixed reg.

* You may compute an operand into a fixed reg, but you may not
  subsequently change the contents of that fixed reg. If you
  want to do so, first copy the value either to a temporary
  or into dst. You are free to modify dst even if it happens
  to be a fixed reg -- that's not your problem.

* You cannot assume that a fixed reg will stay live over an
  arbitrary computation. The same applies to the dst reg.

* Temporary regs obtained from getNewRegNat are distinct from
  each other and from all other regs, and stay live over
  arbitrary computations.

--------------------

SDM's version of The Rules:

* If getRegister returns Any, that means it can generate correct
  code which places the result in any register, period. Even if that
  register happens to be read during the computation.

  Corollary #1: this means that if you are generating code for an
  operation with two arbitrary operands, you cannot assign the result
  of the first operand into the destination register before computing
  the second operand. The second operand might require the old value
  of the destination register.

  Corollary #2: A function might be able to generate more efficient
  code if it knows the destination register is a new temporary (and
  therefore not read by any of the sub-computations).

* If getRegister returns Any, then the code it generates may modify only:
        (a) fresh temporaries
        (b) the destination register
        (c) known registers (eg. %ecx is used by shifts)
  In particular, it may *not* modify global registers, unless the global
  register happens to be the destination register.
-}

trivialCode :: Width -> (Operand -> Operand -> Instr)
            -> Maybe (Operand -> Operand -> Instr)
            -> CmmExpr -> CmmExpr -> NatM Register
trivialCode width instr m a b
    = do is32Bit <- is32BitPlatform
         trivialCode' is32Bit width instr m a b

trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
             -> Maybe (Operand -> Operand -> Instr)
             -> CmmExpr -> CmmExpr -> NatM Register
trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
  | is32BitLit is32Bit lit_a = do
      b_code <- getAnyReg b
      let
          code dst
            = b_code dst `snocOL`
              revinstr (OpImm (litToImm lit_a)) (OpReg dst)
      return (Any (intFormat width) code)

trivialCode' _ width instr _ a b
  = genTrivialCode (intFormat width) instr a b

-- This is re-used for floating pt instructions too.
genTrivialCode :: Format -> (Operand -> Operand -> Instr)
               -> CmmExpr -> CmmExpr -> NatM Register
genTrivialCode rep instr a b = do
  (b_op, b_code) <- getNonClobberedOperand b
  a_code <- getAnyReg a
  tmp <- getNewRegNat rep
  let
     -- We want the value of b to stay alive across the computation of a.
     -- But, we want to calculate a straight into the destination register,
     -- because the instruction only has two operands (dst := dst `op` src).
     -- The troublesome case is when the result of b is in the same register
     -- as the destination reg. In this case, we have to save b in a
     -- new temporary across the computation of a.
     code dst
        | dst `regClashesWithOp` b_op =
                b_code `appOL`
                unitOL (MOV rep b_op (OpReg tmp)) `appOL`
                a_code dst `snocOL`
                instr (OpReg tmp) (OpReg dst)
        | otherwise =
                b_code `appOL`
                a_code dst `snocOL`
                instr b_op (OpReg dst)
  return (Any rep code)

regClashesWithOp :: Reg -> Operand -> Bool
reg `regClashesWithOp` OpReg reg2   = reg == reg2
reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
_   `regClashesWithOp` _            = False
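-- For instance (schematic): when generating @x + y@ and y's operand happens
-- to live in the destination register, we first save y into the fresh tmp,
-- compute x straight into dst, and finish with @instr (OpReg tmp) (OpReg dst)@;
-- otherwise y's operand can be used directly as the source.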

-----------

trivialUCode :: Format -> (Operand -> Instr)
             -> CmmExpr -> NatM Register
trivialUCode rep instr x = do
  x_code <- getAnyReg x
  let
     code dst =
        x_code dst `snocOL`
        instr (OpReg dst)
  return (Any rep code)

-----------


trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
                  -> CmmExpr -> CmmExpr -> NatM Register
trivialFCode_sse2 pk instr x y
    = genTrivialCode format (instr format) x y
    where format = floatFormat pk


trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
trivialUFCode format instr x = do
  (x_reg, x_code) <- getSomeReg x
  let
     code dst =
        x_code `snocOL`
        instr x_reg dst
  return (Any format code)


--------------------------------------------------------------------------------
coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
coerceInt2FP from to x = coerce_sse2
 where

   coerce_sse2 = do
     (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
     let
           opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
                            n -> panic $ "coerceInt2FP.sse: unhandled width ("
                                        ++ show n ++ ")"
           code dst = x_code `snocOL` opc (intFormat from) x_op dst
     return (Any (floatFormat to) code)
        -- works even if the destination rep is <II32

--------------------------------------------------------------------------------
coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
coerceFP2Int from to x = coerceFP2Int_sse2
 where
   coerceFP2Int_sse2 = do
     (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
     let
           opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ
                              n -> panic $ "coerceFP2Int.sse: unhandled width ("
                                          ++ show n ++ ")"
           code dst = x_code `snocOL` opc (intFormat to) x_op dst
     return (Any (intFormat to) code)
        -- works even if the destination rep is <II32


--------------------------------------------------------------------------------
coerceFP2FP :: Width -> CmmExpr -> NatM Register
coerceFP2FP to x = do
  (x_reg, x_code) <- getSomeReg x
  let
        opc = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD
                         n -> panic $ "coerceFP2FP: unhandled width ("
                                    ++ show n ++ ")"
        code dst = x_code `snocOL` opc x_reg dst
  return (Any (floatFormat to) code)

--------------------------------------------------------------------------------

sse2NegCode :: Width -> CmmExpr -> NatM Register
sse2NegCode w x = do
  let fmt = floatFormat w
  x_code <- getAnyReg x
  -- This is how gcc does it, so it can't be that bad:
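  -- negation just flips the IEEE-754 sign bit, so we XOR the value
  -- with a constant that has only the top bit set.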
  let
    const = case fmt of
      FF32 -> CmmInt 0x80000000 W32
      FF64 -> CmmInt 0x8000000000000000 W64
      x@II8  -> wrongFmt x
      x@II16 -> wrongFmt x
      x@II32 -> wrongFmt x
      x@II64 -> wrongFmt x

      where
        wrongFmt x = panic $ "sse2NegCode: " ++ show x
  Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
  tmp <- getNewRegNat fmt
  let
    code dst = x_code dst `appOL` amode_code `appOL` toOL [
        MOV fmt (OpAddr amode) (OpReg tmp),
        XOR fmt (OpReg tmp) (OpReg dst)
        ]
  return (Any fmt code)

isVecExpr :: CmmExpr -> Bool
isVecExpr (CmmMachOp (MO_V_Insert {}) _)   = True
isVecExpr (CmmMachOp (MO_V_Extract {}) _)  = True
isVecExpr (CmmMachOp (MO_V_Add {}) _)      = True
isVecExpr (CmmMachOp (MO_V_Sub {}) _)      = True
isVecExpr (CmmMachOp (MO_V_Mul {}) _)      = True
isVecExpr (CmmMachOp (MO_VS_Quot {}) _)    = True
isVecExpr (CmmMachOp (MO_VS_Rem {}) _)     = True
isVecExpr (CmmMachOp (MO_VS_Neg {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Insert {}) _)  = True
isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
isVecExpr (CmmMachOp (MO_VF_Add {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Sub {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Mul {}) _)     = True
isVecExpr (CmmMachOp (MO_VF_Quot {}) _)    = True
isVecExpr (CmmMachOp (MO_VF_Neg {}) _)     = True
isVecExpr (CmmMachOp _ [e])                = isVecExpr e
isVecExpr _                                = False

needLlvm :: NatM a
needLlvm =
    sorry $ unlines ["The native code generator does not support vector"
                    ,"instructions. Please use -fllvm."]

-- | This works on the invariant that all jumps in the given blocks are required.
--   Starting from there we try to make a few more jumps redundant by reordering
--   them.
--   We depend on the information in the CFG to do so, so without a given CFG
--   we do nothing.
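--
--   For example (schematically), a block ending in
--
--   > je  L2 ; jmp L3   -- where L2 is the very next block
--
--   can instead end in
--
--   > jne L3            -- and fall through to L2
--
--   provided L2 carries no info table.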
invertCondBranches :: Maybe CFG  -- ^ CFG if present
                   -> LabelMap a -- ^ Blocks with info tables
                   -> [NatBasicBlock Instr] -- ^ List of basic blocks
                   -> [NatBasicBlock Instr]
invertCondBranches Nothing _ bs = bs
invertCondBranches (Just cfg) keep bs =
    invert bs
  where
    invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
    invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
      | --pprTrace "Block" (ppr lbl1) True,
        (jmp1,jmp2) <- last2 ins
      , JXX cond1 target1 <- jmp1
      , target1 == lbl2
      --, pprTrace "CutChance" (ppr b1) True
      , JXX ALWAYS target2 <- jmp2
      -- We have enough information to check if we can perform the inversion
      -- TODO: We could also check for the last asm instruction which sets
      -- status flags instead, which I suspect is worse in terms of compiler
      -- performance, but might be applicable to more cases.
      , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
      , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
      -- Both jumps come from the same cmm statement
      , transitionSource edgeInfo1 == transitionSource edgeInfo2
      , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1

      -- Int comparisons are invertible
      , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
      , Just _ <- maybeIntComparison op
      , Just invCond <- maybeInvertCond cond1

      -- Swap the last two jumps, invert the conditional jump's condition.
      = let jumps =
              case () of
                -- We are free to eliminate the jmp, so we do so.
                _ | not (mapMember target1 keep)
                  -> [JXX invCond target2]
                  -- If the conditional target is unlikely we put the other
                  -- target at the front.
                  | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
                  -> [JXX invCond target2, JXX ALWAYS target1]
                  -- Keep things as-is otherwise
                  | otherwise
                  -> [jmp1, jmp2]
        in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
           (BasicBlock lbl1
             (dropTail 2 ins ++ jumps))
           : invert (b2:bs)
    invert (b:bs) = b : invert bs
    invert [] = []