never executed always true always false
1 {-# LANGUAGE TypeFamilies #-}
2
3 -----------------------------------------------------------------------------
4 --
5 -- Machine-dependent assembly language
6 --
7 -- (c) The University of Glasgow 1993-2004
8 --
9 -----------------------------------------------------------------------------
10
11 module GHC.CmmToAsm.X86.Instr
12 ( Instr(..)
13 , Operand(..)
14 , PrefetchVariant(..)
15 , JumpDest(..)
16 , getJumpDestBlockId
17 , canShortcut
18 , shortcutStatics
19 , shortcutJump
20 , allocMoreStack
21 , maxSpillSlots
22 , archWordFormat
23 , takeRegRegMoveInstr
24 , regUsageOfInstr
25 , takeDeltaInstr
26 , mkLoadInstr
27 , mkJumpInstr
28 , mkStackAllocInstr
29 , mkStackDeallocInstr
30 , mkSpillInstr
31 , mkRegRegMoveInstr
32 , jumpDestsOfInstr
33 , patchRegsOfInstr
34 , patchJumpInstr
35 , isMetaInstr
36 , isJumpishInstr
37 )
38 where
39
40 import GHC.Prelude
41
42 import GHC.CmmToAsm.X86.Cond
43 import GHC.CmmToAsm.X86.Regs
44 import GHC.CmmToAsm.Format
45 import GHC.CmmToAsm.Types
46 import GHC.CmmToAsm.Utils
47 import GHC.CmmToAsm.Instr (RegUsage(..), noUsage)
48 import GHC.Platform.Reg.Class
49 import GHC.Platform.Reg
50 import GHC.CmmToAsm.Reg.Target
51 import GHC.CmmToAsm.Config
52
53 import GHC.Cmm.BlockId
54 import GHC.Cmm.Dataflow.Collections
55 import GHC.Cmm.Dataflow.Label
56 import GHC.Platform.Regs
57 import GHC.Cmm
58 import GHC.Utils.Outputable
59 import GHC.Utils.Panic
60 import GHC.Platform
61
62 import GHC.Cmm.CLabel
63 import GHC.Types.Unique.Set
64 import GHC.Types.Unique
65 import GHC.Types.Unique.Supply
66 import GHC.Types.Basic (Alignment)
67 import GHC.Cmm.DebugBlock (UnwindTable)
68
69 import Control.Monad
70 import Data.Maybe (fromMaybe)
71
72 -- Format of an x86/x86_64 memory address, in bytes.
73 --
74 archWordFormat :: Bool -> Format
75 archWordFormat is32Bit
76 | is32Bit = II32
77 | otherwise = II64
78
79 -- -----------------------------------------------------------------------------
80 -- Intel x86 instructions
81
82 {-
83 Intel, in their infinite wisdom, selected a stack model for floating
84 point registers on x86. That might have made sense back in 1979 --
85 nowadays we can see it for the nonsense it really is. A stack model
86 fits poorly with the existing nativeGen infrastructure, which assumes
87 flat integer and FP register sets. Prior to this commit, nativeGen
88 could not generate correct x86 FP code -- to do so would have meant
89 somehow working the register-stack paradigm into the register
90 allocator and spiller, which sounds very difficult.
91
92 We have decided to cheat, and go for a simple fix which requires no
93 infrastructure modifications, at the expense of generating ropey but
94 correct FP code. All notions of the x86 FP stack and its insns have
95 been removed. Instead, we pretend (to the instruction selector and
96 register allocator) that x86 has six floating point registers, %fake0
97 .. %fake5, which can be used in the usual flat manner. We further
98 claim that x86 has floating point instructions very similar to SPARC
99 and Alpha, that is, a simple 3-operand register-register arrangement.
100 Code generation and register allocation proceed on this basis.
101
102 When we come to print out the final assembly, our convenient fiction
103 is converted to dismal reality. Each fake instruction is
104 independently converted to a series of real x86 instructions.
105 %fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
106 arithmetic operations, the two operands are pushed onto the top of the
107 FP stack, the operation done, and the result copied back into the
108 relevant register. There are only six %fake registers because 2 are
109 needed for the translation, and x86 has 8 in total.
110
111 The translation is inefficient but is simple and it works. A cleverer
112 translation would handle a sequence of insns, simulating the FP stack
113 contents, would not impose a fixed mapping from %fake to %st regs, and
114 hopefully could avoid most of the redundant reg-reg moves of the
115 current translation.
116
117 We might as well make use of whatever unique FP facilities Intel have
118 chosen to bless us with (let's not be churlish, after all).
119 Hence GLDZ and GLD1. Bwahahahahahahaha!
120 -}
121
122 {-
123 Note [x86 Floating point precision]
124
125 Intel's internal floating point registers are by default 80 bit
126 extended precision. This means that all operations done on values in
127 registers are done at 80 bits, and unless the intermediate values are
128 truncated to the appropriate size (32 or 64 bits) by storing in
129 memory, calculations in registers will give different results from
130 calculations which pass intermediate values in memory (eg. via
131 function calls).
132
133 One solution is to set the FPU into 64 bit precision mode. Some OSs
134 do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
135 that this will only affect 64-bit precision arithmetic; 32-bit
136 calculations will still be done at 64-bit precision in registers. So
137 it doesn't solve the whole problem.
138
139 There's also the issue of what the C library is expecting in terms of
140 precision. It seems to be the case that glibc on Linux expects the
141 FPU to be set to 80 bit precision, so setting it to 64 bit could have
142 unexpected effects. Changing the default could have undesirable
143 effects on other 3rd-party library code too, so the right thing would
144 be to save/restore the FPU control word across Haskell code if we were
145 to do this.
146
147 gcc's -ffloat-store gives consistent results by always storing the
148 results of floating-point calculations in memory, which works for both
149 32 and 64-bit precision. However, it only affects the values of
150 user-declared floating point variables in C, not intermediate results.
151 GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
152 flag).
153
154 Another problem is how to spill floating point registers in the
155 register allocator. Should we spill the whole 80 bits, or just 64?
156 On an OS which is set to 64 bit precision, spilling 64 is fine. On
157 Linux, spilling 64 bits will round the results of some operations.
158 This is what gcc does. Spilling at 80 bits requires taking up a full
159 128 bit slot (so we get alignment). We spill at 80-bits and ignore
160 the alignment problems.
161
162 In the future [edit: now available in GHC 7.0.1, with the -msse2
163 flag], we'll use the SSE registers for floating point. This requires
164 a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
165 float ops), which means P4 or Xeon and above. Using SSE will solve
166 all these problems, because the SSE registers use fixed 32 bit or 64
167 bit precision.
168
169 --SDM 1/2003
170 -}
171
172 data Instr
173 -- comment pseudo-op
174 = COMMENT SDoc
175
176 -- location pseudo-op (file, line, col, name)
177 | LOCATION Int Int Int String
178
179 -- some static data spat out during code
180 -- generation. Will be extracted before
181 -- pretty-printing.
182 | LDATA Section (Alignment, RawCmmStatics)
183
184 -- start a new basic block. Useful during
185 -- codegen, removed later. Preceding
186 -- instruction should be a jump, as per the
187 -- invariants for a BasicBlock (see Cmm).
188 | NEWBLOCK BlockId
189
190 -- unwinding information
191 -- See Note [Unwinding information in the NCG].
192 | UNWIND CLabel UnwindTable
193
194 -- specify current stack offset for benefit of subsequent passes.
195 -- This carries a BlockId so it can be used in unwinding information.
196 | DELTA Int
197
198 -- Moves.
199 | MOV Format Operand Operand
200 | CMOV Cond Format Operand Reg
201 | MOVZxL Format Operand Operand
202 -- ^ The format argument is the size of operand 1 (the number of bits we keep)
203 -- We always zero *all* high bits, even though this isn't how the actual instruction
204 -- works. The code generator also seems to rely on this behaviour and it's faster
205 -- to execute on many cpus as well so for now I'm just documenting the fact.
206 | MOVSxL Format Operand Operand -- format is the size of operand 1
207 -- x86_64 note: plain mov into a 32-bit register always zero-extends
208 -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
209 -- don't affect the high bits of the register.
210
211 -- Load effective address (also a very useful three-operand add instruction :-)
212 | LEA Format Operand Operand
213
214 -- Int Arithmetic.
215 | ADD Format Operand Operand
216 | ADC Format Operand Operand
217 | SUB Format Operand Operand
218 | SBB Format Operand Operand
219
220 | MUL Format Operand Operand
221 | MUL2 Format Operand -- %edx:%eax = operand * %rax
222 | IMUL Format Operand Operand -- signed int mul
223 | IMUL2 Format Operand -- %edx:%eax = operand * %eax
224
225 | DIV Format Operand -- eax := eax:edx/op, edx := eax:edx%op
226 | IDIV Format Operand -- ditto, but signed
227
228 -- Int Arithmetic, where the effects on the condition register
229 -- are important. Used in specialized sequences such as MO_Add2.
230 -- Do not rewrite these instructions to "equivalent" ones that
231 -- have different effect on the condition register! (See #9013.)
232 | ADD_CC Format Operand Operand
233 | SUB_CC Format Operand Operand
234
235 -- Simple bit-twiddling.
236 | AND Format Operand Operand
237 | OR Format Operand Operand
238 | XOR Format Operand Operand
239 | NOT Format Operand
240 | NEGI Format Operand -- NEG instruction (name clash with Cond)
241 | BSWAP Format Reg
242
243 -- Shifts (amount may be immediate or %cl only)
244 | SHL Format Operand{-amount-} Operand
245 | SAR Format Operand{-amount-} Operand
246 | SHR Format Operand{-amount-} Operand
247
248 | BT Format Imm Operand
249 | NOP
250
251
252 -- We need to support the FSTP (x87 store and pop) instruction
253 -- so that we can correctly read off the return value of an
254 -- x86 CDECL C function call when its floating point.
255 -- so we dont include a register argument, and just use st(0)
256 -- this instruction is used ONLY for return values of C ffi calls
257 -- in x86_32 abi
258 | X87Store Format AddrMode -- st(0), dst
259
260
261 -- SSE2 floating point: we use a restricted set of the available SSE2
262 -- instructions for floating-point.
263 -- use MOV for moving (either movss or movsd (movlpd better?))
264 | CVTSS2SD Reg Reg -- F32 to F64
265 | CVTSD2SS Reg Reg -- F64 to F32
266 | CVTTSS2SIQ Format Operand Reg -- F32 to I32/I64 (with truncation)
267 | CVTTSD2SIQ Format Operand Reg -- F64 to I32/I64 (with truncation)
268 | CVTSI2SS Format Operand Reg -- I32/I64 to F32
269 | CVTSI2SD Format Operand Reg -- I32/I64 to F64
270
271 -- use ADD, SUB, and SQRT for arithmetic. In both cases, operands
272 -- are Operand Reg.
273
274 -- SSE2 floating-point division:
275 | FDIV Format Operand Operand -- divisor, dividend(dst)
276
277 -- use CMP for comparisons. ucomiss and ucomisd instructions
278 -- compare single/double prec floating point respectively.
279
280 | SQRT Format Operand Reg -- src, dst
281
282
283 -- Comparison
284 | TEST Format Operand Operand
285 | CMP Format Operand Operand
286 | SETCC Cond Operand
287
288 -- Stack Operations.
289 | PUSH Format Operand
290 | POP Format Operand
291 -- both unused (SDM):
292 -- | PUSHA
293 -- | POPA
294
295 -- Jumping around.
296 | JMP Operand [Reg] -- including live Regs at the call
297 | JXX Cond BlockId -- includes unconditional branches
298 | JXX_GBL Cond Imm -- non-local version of JXX
299 -- Table jump
300 | JMP_TBL Operand -- Address to jump to
301 [Maybe JumpDest] -- Targets of the jump table
302 Section -- Data section jump table should be put in
303 CLabel -- Label of jump table
304 -- | X86 call instruction
305 | CALL (Either Imm Reg) -- ^ Jump target
306 [Reg] -- ^ Arguments (required for register allocation)
307
308 -- Other things.
309 | CLTD Format -- sign extend %eax into %edx:%eax
310
311 | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
312 -- pretty-prints as
313 -- call 1f
314 -- 1: popl %reg
315 -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
316 | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
317 -- pretty-prints as
318 -- call 1f
319 -- 1: popl %reg
320
321 -- bit counting instructions
322 | POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1
323 | LZCNT Format Operand Reg -- [BMI2] count number of leading zeros
324 | TZCNT Format Operand Reg -- [BMI2] count number of trailing zeros
325 | BSF Format Operand Reg -- bit scan forward
326 | BSR Format Operand Reg -- bit scan reverse
327
328 -- bit manipulation instructions
329 | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
330 | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
331
332 -- prefetch
333 | PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
334 -- variant can be NTA, Lvl0, Lvl1, or Lvl2
335
336 | LOCK Instr -- lock prefix
337 | XADD Format Operand Operand -- src (r), dst (r/m)
338 | CMPXCHG Format Operand Operand -- src (r), dst (r/m), eax implicit
339 | XCHG Format Operand Reg -- src (r/m), dst (r/m)
340 | MFENCE
341
342 data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
343
344
345 data Operand
346 = OpReg Reg -- register
347 | OpImm Imm -- immediate value
348 | OpAddr AddrMode -- memory reference
349
350
351
352 -- | Returns which registers are read and written as a (read, written)
353 -- pair.
354 regUsageOfInstr :: Platform -> Instr -> RegUsage
355 regUsageOfInstr platform instr
356 = case instr of
357 MOV _ src dst -> usageRW src dst
358 CMOV _ _ src dst -> mkRU (use_R src [dst]) [dst]
359 MOVZxL _ src dst -> usageRW src dst
360 MOVSxL _ src dst -> usageRW src dst
361 LEA _ src dst -> usageRW src dst
362 ADD _ src dst -> usageRM src dst
363 ADC _ src dst -> usageRM src dst
364 SUB _ src dst -> usageRM src dst
365 SBB _ src dst -> usageRM src dst
366 IMUL _ src dst -> usageRM src dst
367
368 -- Result of IMULB will be in just in %ax
369 IMUL2 II8 src -> mkRU (eax:use_R src []) [eax]
370 -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and
371 -- %ax/%eax/%rax.
372 IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
373
374 MUL _ src dst -> usageRM src dst
375 MUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
376 DIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
377 IDIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
378 ADD_CC _ src dst -> usageRM src dst
379 SUB_CC _ src dst -> usageRM src dst
380 AND _ src dst -> usageRM src dst
381 OR _ src dst -> usageRM src dst
382
383 XOR _ (OpReg src) (OpReg dst)
384 | src == dst -> mkRU [] [dst]
385
386 XOR _ src dst -> usageRM src dst
387 NOT _ op -> usageM op
388 BSWAP _ reg -> mkRU [reg] [reg]
389 NEGI _ op -> usageM op
390 SHL _ imm dst -> usageRM imm dst
391 SAR _ imm dst -> usageRM imm dst
392 SHR _ imm dst -> usageRM imm dst
393 BT _ _ src -> mkRUR (use_R src [])
394
395 PUSH _ op -> mkRUR (use_R op [])
396 POP _ op -> mkRU [] (def_W op)
397 TEST _ src dst -> mkRUR (use_R src $! use_R dst [])
398 CMP _ src dst -> mkRUR (use_R src $! use_R dst [])
399 SETCC _ op -> mkRU [] (def_W op)
400 JXX _ _ -> mkRU [] []
401 JXX_GBL _ _ -> mkRU [] []
402 JMP op regs -> mkRUR (use_R op regs)
403 JMP_TBL op _ _ _ -> mkRUR (use_R op [])
404 CALL (Left _) params -> mkRU params (callClobberedRegs platform)
405 CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
406 CLTD _ -> mkRU [eax] [edx]
407 NOP -> mkRU [] []
408
409 X87Store _ dst -> mkRUR ( use_EA dst [])
410
411 CVTSS2SD src dst -> mkRU [src] [dst]
412 CVTSD2SS src dst -> mkRU [src] [dst]
413 CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
414 CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
415 CVTSI2SS _ src dst -> mkRU (use_R src []) [dst]
416 CVTSI2SD _ src dst -> mkRU (use_R src []) [dst]
417 FDIV _ src dst -> usageRM src dst
418 SQRT _ src dst -> mkRU (use_R src []) [dst]
419
420 FETCHGOT reg -> mkRU [] [reg]
421 FETCHPC reg -> mkRU [] [reg]
422
423 COMMENT _ -> noUsage
424 LOCATION{} -> noUsage
425 UNWIND{} -> noUsage
426 DELTA _ -> noUsage
427
428 POPCNT _ src dst -> mkRU (use_R src []) [dst]
429 LZCNT _ src dst -> mkRU (use_R src []) [dst]
430 TZCNT _ src dst -> mkRU (use_R src []) [dst]
431 BSF _ src dst -> mkRU (use_R src []) [dst]
432 BSR _ src dst -> mkRU (use_R src []) [dst]
433
434 PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
435 PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
436
437 -- note: might be a better way to do this
438 PREFETCH _ _ src -> mkRU (use_R src []) []
439 LOCK i -> regUsageOfInstr platform i
440 XADD _ src dst -> usageMM src dst
441 CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
442 XCHG _ src dst -> usageMM src (OpReg dst)
443 MFENCE -> noUsage
444
445 _other -> panic "regUsage: unrecognised instr"
446 where
447 -- # Definitions
448 --
449 -- Written: If the operand is a register, it's written. If it's an
450 -- address, registers mentioned in the address are read.
451 --
452 -- Modified: If the operand is a register, it's both read and
453 -- written. If it's an address, registers mentioned in the address
454 -- are read.
455
456 -- 2 operand form; first operand Read; second Written
457 usageRW :: Operand -> Operand -> RegUsage
458 usageRW op (OpReg reg) = mkRU (use_R op []) [reg]
459 usageRW op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
460 usageRW _ _ = panic "X86.RegInfo.usageRW: no match"
461
462 -- 2 operand form; first operand Read; second Modified
463 usageRM :: Operand -> Operand -> RegUsage
464 usageRM op (OpReg reg) = mkRU (use_R op [reg]) [reg]
465 usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
466 usageRM _ _ = panic "X86.RegInfo.usageRM: no match"
467
468 -- 2 operand form; first operand Modified; second Modified
469 usageMM :: Operand -> Operand -> RegUsage
470 usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
471 usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
472 usageMM (OpAddr ea) (OpReg dst) = mkRU (use_EA ea [dst]) [dst]
473 usageMM _ _ = panic "X86.RegInfo.usageMM: no match"
474
475 -- 3 operand form; first operand Read; second Modified; third Modified
476 usageRMM :: Operand -> Operand -> Operand -> RegUsage
477 usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
478 usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
479 usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
480
481 -- 1 operand form; operand Modified
482 usageM :: Operand -> RegUsage
483 usageM (OpReg reg) = mkRU [reg] [reg]
484 usageM (OpAddr ea) = mkRUR (use_EA ea [])
485 usageM _ = panic "X86.RegInfo.usageM: no match"
486
487 -- Registers defd when an operand is written.
488 def_W (OpReg reg) = [reg]
489 def_W (OpAddr _ ) = []
490 def_W _ = panic "X86.RegInfo.def_W: no match"
491
492 -- Registers used when an operand is read.
493 use_R (OpReg reg) tl = reg : tl
494 use_R (OpImm _) tl = tl
495 use_R (OpAddr ea) tl = use_EA ea tl
496
497 -- Registers used to compute an effective address.
498 use_EA (ImmAddr _ _) tl = tl
499 use_EA (AddrBaseIndex base index _) tl =
500 use_base base $! use_index index tl
501 where use_base (EABaseReg r) tl = r : tl
502 use_base _ tl = tl
503 use_index EAIndexNone tl = tl
504 use_index (EAIndex i _) tl = i : tl
505
506 mkRUR src = src' `seq` RU src' []
507 where src' = filter (interesting platform) src
508
509 mkRU src dst = src' `seq` dst' `seq` RU src' dst'
510 where src' = filter (interesting platform) src
511 dst' = filter (interesting platform) dst
512
513 -- | Is this register interesting for the register allocator?
514 interesting :: Platform -> Reg -> Bool
515 interesting _ (RegVirtual _) = True
516 interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
517 interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no reg pairs on this arch"
518
519
520
521 -- | Applies the supplied function to all registers in instructions.
522 -- Typically used to change virtual registers to real registers.
523 patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
524 patchRegsOfInstr instr env
525 = case instr of
526 MOV fmt src dst -> patch2 (MOV fmt) src dst
527 CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst)
528 MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst
529 MOVSxL fmt src dst -> patch2 (MOVSxL fmt) src dst
530 LEA fmt src dst -> patch2 (LEA fmt) src dst
531 ADD fmt src dst -> patch2 (ADD fmt) src dst
532 ADC fmt src dst -> patch2 (ADC fmt) src dst
533 SUB fmt src dst -> patch2 (SUB fmt) src dst
534 SBB fmt src dst -> patch2 (SBB fmt) src dst
535 IMUL fmt src dst -> patch2 (IMUL fmt) src dst
536 IMUL2 fmt src -> patch1 (IMUL2 fmt) src
537 MUL fmt src dst -> patch2 (MUL fmt) src dst
538 MUL2 fmt src -> patch1 (MUL2 fmt) src
539 IDIV fmt op -> patch1 (IDIV fmt) op
540 DIV fmt op -> patch1 (DIV fmt) op
541 ADD_CC fmt src dst -> patch2 (ADD_CC fmt) src dst
542 SUB_CC fmt src dst -> patch2 (SUB_CC fmt) src dst
543 AND fmt src dst -> patch2 (AND fmt) src dst
544 OR fmt src dst -> patch2 (OR fmt) src dst
545 XOR fmt src dst -> patch2 (XOR fmt) src dst
546 NOT fmt op -> patch1 (NOT fmt) op
547 BSWAP fmt reg -> BSWAP fmt (env reg)
548 NEGI fmt op -> patch1 (NEGI fmt) op
549 SHL fmt imm dst -> patch1 (SHL fmt imm) dst
550 SAR fmt imm dst -> patch1 (SAR fmt imm) dst
551 SHR fmt imm dst -> patch1 (SHR fmt imm) dst
552 BT fmt imm src -> patch1 (BT fmt imm) src
553 TEST fmt src dst -> patch2 (TEST fmt) src dst
554 CMP fmt src dst -> patch2 (CMP fmt) src dst
555 PUSH fmt op -> patch1 (PUSH fmt) op
556 POP fmt op -> patch1 (POP fmt) op
557 SETCC cond op -> patch1 (SETCC cond) op
558 JMP op regs -> JMP (patchOp op) regs
559 JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
560
561 -- literally only support storing the top x87 stack value st(0)
562 X87Store fmt dst -> X87Store fmt (lookupAddr dst)
563
564 CVTSS2SD src dst -> CVTSS2SD (env src) (env dst)
565 CVTSD2SS src dst -> CVTSD2SS (env src) (env dst)
566 CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
567 CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
568 CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
569 CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
570 FDIV fmt src dst -> FDIV fmt (patchOp src) (patchOp dst)
571 SQRT fmt src dst -> SQRT fmt (patchOp src) (env dst)
572
573 CALL (Left _) _ -> instr
574 CALL (Right reg) p -> CALL (Right (env reg)) p
575
576 FETCHGOT reg -> FETCHGOT (env reg)
577 FETCHPC reg -> FETCHPC (env reg)
578
579 NOP -> instr
580 COMMENT _ -> instr
581 LOCATION {} -> instr
582 UNWIND {} -> instr
583 DELTA _ -> instr
584
585 JXX _ _ -> instr
586 JXX_GBL _ _ -> instr
587 CLTD _ -> instr
588
589 POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
590 LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst)
591 TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst)
592 PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
593 PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
594 BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
595 BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
596
597 PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)
598
599 LOCK i -> LOCK (patchRegsOfInstr i env)
600 XADD fmt src dst -> patch2 (XADD fmt) src dst
601 CMPXCHG fmt src dst -> patch2 (CMPXCHG fmt) src dst
602 XCHG fmt src dst -> XCHG fmt (patchOp src) (env dst)
603 MFENCE -> instr
604
605 _other -> panic "patchRegs: unrecognised instr"
606
607 where
608 patch1 :: (Operand -> a) -> Operand -> a
609 patch1 insn op = insn $! patchOp op
610 patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
611 patch2 insn src dst = (insn $! patchOp src) $! patchOp dst
612
613 patchOp (OpReg reg) = OpReg $! env reg
614 patchOp (OpImm imm) = OpImm imm
615 patchOp (OpAddr ea) = OpAddr $! lookupAddr ea
616
617 lookupAddr (ImmAddr imm off) = ImmAddr imm off
618 lookupAddr (AddrBaseIndex base index disp)
619 = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
620 where
621 lookupBase EABaseNone = EABaseNone
622 lookupBase EABaseRip = EABaseRip
623 lookupBase (EABaseReg r) = EABaseReg $! env r
624
625 lookupIndex EAIndexNone = EAIndexNone
626 lookupIndex (EAIndex r i) = (EAIndex $! env r) i
627
628
629 --------------------------------------------------------------------------------
630 isJumpishInstr
631 :: Instr -> Bool
632
633 isJumpishInstr instr
634 = case instr of
635 JMP{} -> True
636 JXX{} -> True
637 JXX_GBL{} -> True
638 JMP_TBL{} -> True
639 CALL{} -> True
640 _ -> False
641
642
643 jumpDestsOfInstr
644 :: Instr
645 -> [BlockId]
646
647 jumpDestsOfInstr insn
648 = case insn of
649 JXX _ id -> [id]
650 JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
651 _ -> []
652
653
654 patchJumpInstr
655 :: Instr -> (BlockId -> BlockId) -> Instr
656
657 patchJumpInstr insn patchF
658 = case insn of
659 JXX cc id -> JXX cc (patchF id)
660 JMP_TBL op ids section lbl
661 -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
662 _ -> insn
663 where
664 patchJumpDest f (DestBlockId id) = DestBlockId (f id)
665 patchJumpDest _ dest = dest
666
667
668
669
670
671 -- -----------------------------------------------------------------------------
672 -- | Make a spill instruction.
673 mkSpillInstr
674 :: NCGConfig
675 -> Reg -- register to spill
676 -> Int -- current stack delta
677 -> Int -- spill slot to use
678 -> [Instr]
679
680 mkSpillInstr config reg delta slot
681 = let off = spillSlotToOffset platform slot - delta
682 in
683 case targetClassOfReg platform reg of
684 RcInteger -> [MOV (archWordFormat is32Bit)
685 (OpReg reg) (OpAddr (spRel platform off))]
686 RcDouble -> [MOV FF64 (OpReg reg) (OpAddr (spRel platform off))]
687 _ -> panic "X86.mkSpillInstr: no match"
688 where platform = ncgPlatform config
689 is32Bit = target32Bit platform
690
691 -- | Make a spill reload instruction.
692 mkLoadInstr
693 :: NCGConfig
694 -> Reg -- register to load
695 -> Int -- current stack delta
696 -> Int -- spill slot to use
697 -> [Instr]
698
699 mkLoadInstr config reg delta slot
700 = let off = spillSlotToOffset platform slot - delta
701 in
702 case targetClassOfReg platform reg of
703 RcInteger -> ([MOV (archWordFormat is32Bit)
704 (OpAddr (spRel platform off)) (OpReg reg)])
705 RcDouble -> ([MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)])
706 _ -> panic "X86.mkLoadInstr"
707 where platform = ncgPlatform config
708 is32Bit = target32Bit platform
709
710 spillSlotSize :: Platform -> Int
711 spillSlotSize platform
712 | target32Bit platform = 12
713 | otherwise = 8
714
715 maxSpillSlots :: NCGConfig -> Int
716 maxSpillSlots config
717 = ((ncgSpillPreallocSize config - 64) `div` spillSlotSize (ncgPlatform config)) - 1
718 -- = 0 -- useful for testing allocMoreStack
719
720 -- number of bytes that the stack pointer should be aligned to
721 stackAlign :: Int
722 stackAlign = 16
723
724 -- convert a spill slot number to a *byte* offset, with no sign:
725 -- decide on a per arch basis whether you are spilling above or below
726 -- the C stack pointer.
727 spillSlotToOffset :: Platform -> Int -> Int
728 spillSlotToOffset platform slot
729 = 64 + spillSlotSize platform * slot
730
731 --------------------------------------------------------------------------------
732
733 -- | See if this instruction is telling us the current C stack delta
734 takeDeltaInstr
735 :: Instr
736 -> Maybe Int
737
738 takeDeltaInstr instr
739 = case instr of
740 DELTA i -> Just i
741 _ -> Nothing
742
743
744 isMetaInstr
745 :: Instr
746 -> Bool
747
748 isMetaInstr instr
749 = case instr of
750 COMMENT{} -> True
751 LOCATION{} -> True
752 LDATA{} -> True
753 NEWBLOCK{} -> True
754 UNWIND{} -> True
755 DELTA{} -> True
756 _ -> False
757
758
759
760 --- TODO: why is there
761 -- | Make a reg-reg move instruction.
762 -- On SPARC v8 there are no instructions to move directly between
763 -- floating point and integer regs. If we need to do that then we
764 -- have to go via memory.
765 --
766 mkRegRegMoveInstr
767 :: Platform
768 -> Reg
769 -> Reg
770 -> Instr
771
772 mkRegRegMoveInstr platform src dst
773 = case targetClassOfReg platform src of
774 RcInteger -> case platformArch platform of
775 ArchX86 -> MOV II32 (OpReg src) (OpReg dst)
776 ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
777 _ -> panic "X86.mkRegRegMoveInstr: Bad arch"
778 RcDouble -> MOV FF64 (OpReg src) (OpReg dst)
779 -- this code is the lie we tell ourselves because both float and double
780 -- use the same register class.on x86_64 and x86 32bit with SSE2,
781 -- more plainly, both use the XMM registers
782 _ -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
783
784 -- | Check whether an instruction represents a reg-reg move.
785 -- The register allocator attempts to eliminate reg->reg moves whenever it can,
786 -- by assigning the src and dest temporaries to the same real register.
787 --
788 takeRegRegMoveInstr
789 :: Instr
790 -> Maybe (Reg,Reg)
791
792 takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2))
793 = Just (r1,r2)
794
795 takeRegRegMoveInstr _ = Nothing
796
797
798 -- | Make an unconditional branch instruction.
799 mkJumpInstr
800 :: BlockId
801 -> [Instr]
802
803 mkJumpInstr id
804 = [JXX ALWAYS id]
805
806 -- Note [Windows stack layout]
807 -- | On most OSes the kernel will place a guard page after the current stack
808 -- page. If you allocate larger than a page worth you may jump over this
809 -- guard page. Not only is this a security issue, but on certain OSes such
810 -- as Windows a new page won't be allocated if you don't hit the guard. This
811 -- will cause a segfault or access fault.
812 --
813 -- This function defines if the current allocation amount requires a probe.
814 -- On Windows (for now) we emit a call to _chkstk for this. For other OSes
815 -- this is not yet implemented.
816 -- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
817 -- The Windows stack looks like this:
818 --
819 -- +-------------------+
820 -- | SP |
821 -- +-------------------+
822 -- | |
823 -- | GUARD PAGE |
824 -- | |
825 -- +-------------------+
826 -- | |
827 -- | |
828 -- | UNMAPPED |
829 -- | |
830 -- | |
831 -- +-------------------+
832 --
833 -- In essence each allocation larger than a page size needs to be chunked and
834 -- a probe emitted after each page allocation. You have to hit the guard
835 -- page so the kernel can map in the next page, otherwise you'll segfault.
836 -- See Note [Windows stack allocations].
837 --
838 needs_probe_call :: Platform -> Int -> Bool
839 needs_probe_call platform amount
840 = case platformOS platform of
841 OSMinGW32 -> case platformArch platform of
842 ArchX86 -> amount > (4 * 1024)
843 ArchX86_64 -> amount > (4 * 1024)
844 _ -> False
845 _ -> False
846
847 mkStackAllocInstr
848 :: Platform
849 -> Int
850 -> [Instr]
851 mkStackAllocInstr platform amount
852 = case platformOS platform of
853 OSMinGW32 ->
854 -- These will clobber AX but this should be ok because
855 --
856 -- 1. It is the first thing we do when entering the closure and AX is
857 -- a caller saved registers on Windows both on x86_64 and x86.
858 --
859 -- 2. The closures are only entered via a call or longjmp in which case
860 -- there are no expectations for volatile registers.
861 --
862 -- 3. When the target is a local branch point it is re-targeted
863 -- after the dealloc, preserving #2. See note [extra spill slots].
864 --
865 -- We emit a call because the stack probes are quite involved and
866 -- would bloat code size a lot. GHC doesn't really have an -Os.
867 -- ___chkstk is guaranteed to leave all nonvolatile registers and AX
868 -- untouched. It's part of the standard prologue code for any Windows
869 -- function dropping the stack more than a page.
870 -- See Note [Windows stack layout]
871 case platformArch platform of
872 ArchX86 | needs_probe_call platform amount ->
873 [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
874 , CALL (Left $ strImmLit "___chkstk_ms") [eax]
875 , SUB II32 (OpReg eax) (OpReg esp)
876 ]
877 | otherwise ->
878 [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
879 , TEST II32 (OpReg esp) (OpReg esp)
880 ]
881 ArchX86_64 | needs_probe_call platform amount ->
882 [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
883 , CALL (Left $ strImmLit "___chkstk_ms") [rax]
884 , SUB II64 (OpReg rax) (OpReg rsp)
885 ]
886 | otherwise ->
887 [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
888 , TEST II64 (OpReg rsp) (OpReg rsp)
889 ]
890 _ -> panic "X86.mkStackAllocInstr"
891 _ ->
892 case platformArch platform of
893 ArchX86 -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
894 ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
895 _ -> panic "X86.mkStackAllocInstr"
896
897 mkStackDeallocInstr
898 :: Platform
899 -> Int
900 -> [Instr]
901 mkStackDeallocInstr platform amount
902 = case platformArch platform of
903 ArchX86 -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
904 ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
905 _ -> panic "X86.mkStackDeallocInstr"
906
907
908 --
909 -- Note [extra spill slots]
910 --
911 -- If the register allocator used more spill slots than we have
912 -- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
913 -- C stack space on entry and exit from this proc. Therefore we
914 -- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
915 -- before every non-local jump.
916 --
917 -- This became necessary when the new codegen started bundling entire
918 -- functions together into one proc, because the register allocator
919 -- assigns a different stack slot to each virtual reg within a proc.
920 -- To avoid using so many slots we could also:
921 --
922 -- - split up the proc into connected components before code generator
923 --
924 -- - rename the virtual regs, so that we re-use vreg names and hence
925 -- stack slots for non-overlapping vregs.
926 --
927 -- Note that when a block is both a non-local entry point (with an
928 -- info table) and a local branch target, we have to split it into
929 -- two, like so:
930 --
931 -- <info table>
932 -- L:
933 -- <code>
934 --
935 -- becomes
936 --
937 -- <info table>
938 -- L:
939 -- subl $rsp, N
940 -- jmp Lnew
941 -- Lnew:
942 -- <code>
943 --
944 -- and all branches pointing to L are retargetted to point to Lnew.
945 -- Otherwise, we would repeat the $rsp adjustment for each branch to
946 -- L.
947 --
948 -- Returns a list of (L,Lnew) pairs.
949 --
950 allocMoreStack
951 :: Platform
952 -> Int
953 -> NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr
954 -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr, [(BlockId,BlockId)])
955
956 allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
957 allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
958 let entries = entryBlocks proc
959
960 uniqs <- replicateM (length entries) getUniqueM
961
962 let
963 delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
964 where x = slots * spillSlotSize platform -- sp delta
965
966 alloc = mkStackAllocInstr platform delta
967 dealloc = mkStackDeallocInstr platform delta
968
969 retargetList = (zip entries (map mkBlockId uniqs))
970
971 new_blockmap :: LabelMap BlockId
972 new_blockmap = mapFromList retargetList
973
974 insert_stack_insns (BasicBlock id insns)
975 | Just new_blockid <- mapLookup id new_blockmap
976 = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
977 , BasicBlock new_blockid block' ]
978 | otherwise
979 = [ BasicBlock id block' ]
980 where
981 block' = foldr insert_dealloc [] insns
982
983 insert_dealloc insn r = case insn of
984 JMP _ _ -> dealloc ++ (insn : r)
985 JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
986 _other -> patchJumpInstr insn retarget : r
987 where retarget b = fromMaybe b (mapLookup b new_blockmap)
988
989 new_code = concatMap insert_stack_insns code
990 -- in
991 return (CmmProc info lbl live (ListGraph new_code), retargetList)
992
993 data JumpDest = DestBlockId BlockId | DestImm Imm
994
995 -- Debug Instance
996 instance Outputable JumpDest where
997 ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
998 ppr (DestImm _imm) = text "jd<imm>:noShow"
999
1000
1001 getJumpDestBlockId :: JumpDest -> Maybe BlockId
1002 getJumpDestBlockId (DestBlockId bid) = Just bid
1003 getJumpDestBlockId _ = Nothing
1004
1005 canShortcut :: Instr -> Maybe JumpDest
1006 canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
1007 canShortcut (JMP (OpImm imm) _) = Just (DestImm imm)
1008 canShortcut _ = Nothing
1009
1010
1011 -- This helper shortcuts a sequence of branches.
1012 -- The blockset helps avoid following cycles.
1013 shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
1014 shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
1015 where
1016 shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
1017 shortcutJump' fn seen insn@(JXX cc id) =
1018 if setMember id seen then insn
1019 else case fn id of
1020 Nothing -> insn
1021 Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
1022 Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
1023 where seen' = setInsert id seen
1024 shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
1025 let updateBlock (Just (DestBlockId bid)) =
1026 case fn bid of
1027 Nothing -> Just (DestBlockId bid )
1028 Just dest -> Just dest
1029 updateBlock dest = dest
1030 blocks' = map updateBlock blocks
1031 in JMP_TBL addr blocks' section tblId
1032 shortcutJump' _ _ other = other
1033
1034 -- Here because it knows about JumpDest
1035 shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, RawCmmStatics) -> (Alignment, RawCmmStatics)
1036 shortcutStatics fn (align, CmmStaticsRaw lbl statics)
1037 = (align, CmmStaticsRaw lbl $ map (shortcutStatic fn) statics)
1038 -- we need to get the jump tables, so apply the mapping to the entries
1039 -- of a CmmData too.
1040
1041 shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
1042 shortcutLabel fn lab
1043 | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
1044 | otherwise = lab
1045
1046 shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
1047 shortcutStatic fn (CmmStaticLit (CmmLabel lab))
1048 = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
1049 shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
1050 = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
1051 -- slightly dodgy, we're ignoring the second label, but this
1052 -- works with the way we use CmmLabelDiffOff for jump tables now.
1053 shortcutStatic _ other_static
1054 = other_static
1055
1056 shortBlockId
1057 :: (BlockId -> Maybe JumpDest)
1058 -> UniqSet Unique
1059 -> BlockId
1060 -> CLabel
1061
1062 shortBlockId fn seen blockid =
1063 case (elementOfUniqSet uq seen, fn blockid) of
1064 (True, _) -> blockLbl blockid
1065 (_, Nothing) -> blockLbl blockid
1066 (_, Just (DestBlockId blockid')) -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
1067 (_, Just (DestImm (ImmCLbl lbl))) -> lbl
1068 (_, _other) -> panic "shortBlockId"
1069 where uq = getUnique blockid