//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD bdver2 (Piledriver) to support
// instruction scheduling and other instruction cost heuristics.
// Based on:
//  * AMD Software Optimization Guide for AMD Family 15h Processors.
//    https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
//    http://www.agner.org/optimize/microarchitecture.pdf
//  * https://www.realworldtech.com/bulldozer/
//    Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record. Every resource and write definition in the
// 'let SchedModel = BdVer2Model in { ... }' scope below is attached to it.
def BdVer2Model : SchedMachineModel {
  // Up to 4 IPC can be decoded, issued, retired.
  let IssueWidth = 4;

  // RCU reorder buffer size, which is unconfirmed.
  let MicroOpBufferSize = 128;

  // There does not seem to be a loop buffer.
  let LoopMicroOpBufferSize = -1;

  // L1 data cache has a 4-cycle load-to-use latency.
  let LoadLatency = 4;

  // FIXME: any better choice?
  let HighLatency = 25;

  // Minimum branch misdirection penalty.
  let MispredictPenalty = 20;

  // Enable Post RegAlloc Scheduler pass.
  let PostRAScheduler = 1;

  // FIXME: Incomplete. This flag is set to allow the scheduler to assign
  //        a default model to unrecognized opcodes.
  let CompleteModel = 0;
} // SchedMachineModel

let SchedModel = BdVer2Model in {

//===----------------------------------------------------------------------===//
// Pipes
//===----------------------------------------------------------------------===//

// There are a total of eight pipes.

//===----------------------------------------------------------------------===//
// Integer execution pipes
//

// Two EX (ALU) pipes.
def PdEX0  : ProcResource<1>; // ALU, Integer Pipe0
def PdEX1  : ProcResource<1>; // ALU, Integer Pipe1
def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>;

// Two AGLU pipes, identical.
def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]

//===----------------------------------------------------------------------===//
// Floating point execution pipes
//

// Four FPU pipes.

def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3

// FPU grouping
def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>;
def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>;


//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
// On the other hand, the RCU reorder buffer size for Piledriver does not
// seem to be specified in any trustworthy source.
// But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had
// RCU reorder buffer size of 128. So that is a good guess for now.
def PdRCU : RetireControlUnit<128, 4>;


//===----------------------------------------------------------------------===//
// Pipelines
//===----------------------------------------------------------------------===//

// There are a total of two pipelines, each one with its own scheduler.

//===----------------------------------------------------------------------===//
// Integer Pipeline Scheduling
//

// There is one Integer Scheduler per core.

// Integer physical register file has 96 registers of 64-bit.
def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>;

// Unified Integer, Memory Scheduler has 40 entries.
def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
  // Scheduler capacity: the 40 entries mentioned above.
  let BufferSize = 40;
}


//===----------------------------------------------------------------------===//
// FPU Pipeline Scheduling
//

// The FPU unit is shared between the two cores.

// FP physical register file has 160 registers of 128-bit.
// Operations on 256-bit data types are cracked into two COPs.
def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;

// Unified FP Scheduler has 64 entries.
def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
  // Scheduler capacity: the 64 entries mentioned above.
  let BufferSize = 64;
}


//===----------------------------------------------------------------------===//
// Functional units
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Load-Store Units
//

let Super = PdAGLU01 in
def PdLoad : ProcResource<2> {
  // For Piledriver, the load queue is 40 entries deep.
  let BufferSize = 40;
}

// The load queue is described by the PdLoad resource defined just above.
// NOTE(review): the '<PdLoad>' argument was missing in this copy of the file
// ('def PdLoadQueue : LoadQueue;'); restored on the assumption that the queue
// descriptor is the load resource - confirm against upstream.
def PdLoadQueue : LoadQueue<PdLoad>;

let Super = PdAGLU01 in
def PdStore : ProcResource<1> {
  // For Piledriver, the store queue is 24 entries deep.
  let BufferSize = 24;
}

// The store queue is described by the PdStore resource defined just above.
// NOTE(review): the '<PdStore>' argument was missing in this copy of the file;
// restored on the same assumption as for PdLoadQueue - confirm against upstream.
def PdStoreQueue : StoreQueue<PdStore>;

//===----------------------------------------------------------------------===//
// Integer Execution Units
//

def PdDiv    : ProcResource<1>; // PdEX0; unpipelined integer division
def PdCount  : ProcResource<1>; // PdEX0; POPCNT, LZCOUNT
def PdMul    : ProcResource<1>; // PdEX1; integer multiplication
def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches

//===----------------------------------------------------------------------===//
// Floating-Point Units
//

// Two FMAC/FPFMA units.
def PdFPFMA : ProcResource<2>; // PdFPU0, PdFPU1

// One 128-bit integer multiply-accumulate unit.
def PdFPMMA : ProcResource<1>; // PdFPU0

// One fp conversion unit.
def PdFPCVT : ProcResource<1>; // PdFPU0

// One unit for shuffles, packs, permutes, shifts.
def PdFPXBR : ProcResource<1>; // PdFPU1

// Two 128-bit packed integer units.
def PdFPMAL : ProcResource<2>; // PdFPU2, PdFPU3

// One FP store unit.
def PdFPSTO : ProcResource<1>; // PdFPU3


//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
//
// NOTE(review): in this copy of the file the template parameter lists and the
// instantiation arguments of the helpers below had been stripped (for example
// 'multiclass PdWriteRes ExePorts, int Lat = 1, list Res = [], ...' and
// 'defm : __pdWriteResPair;'). They are reconstructed here from the parameter
// names that were still visible and from the latency comments in this file.
// Confirm every reconstructed argument against the upstream source.

// Defines a single WriteRes for SchedRW on ExePorts with the given latency,
// per-resource cycle counts and micro-op count.
multiclass PdWriteRes<SchedWrite SchedRW,
                      list<ProcResourceKind> ExePorts, int Lat = 1,
                      list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
}

// Defines the register form of SchedRW and its folded-load form. The folded
// form additionally uses PdLoad, and adds LoadLat to the latency and LoadUOps
// to the micro-op count.
multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts, int Lat,
                            list<int> Res, int UOps,
                            int LoadLat, int LoadRes, int LoadUOps> {
  defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  // NOTE(review): this whole argument list was missing; the expression below
  // is a reconstruction. ResourceCycles is left empty only when both the
  // register form and the load use the default single cycle; otherwise the
  // load's cycle count is prepended to match the prepended PdLoad resource.
  defm : PdWriteRes<SchedRW.Folded,
                    !listconcat([PdLoad], ExePorts),
                    !add(Lat, LoadLat),
                    !if(!and(!empty(Res), !eq(LoadRes, 1)),
                      [],
                      !listconcat([LoadRes], Res)),
                    !add(UOps, LoadUOps)>;
}

// Integer pair: 4-cycle load-to-use latency (matches LoadLatency above).
// NOTE(review): LoadRes = 1 is not derivable from this file - confirm.
multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts, int Lat = 1,
                            list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                          /*LoadLat*/4, /*LoadRes*/1, LoadUOps>;
}

// 128-bit vector pair: vector loads are 5 cycles.
// NOTE(review): LoadRes = 1 is not derivable from this file - confirm.
multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat = 1,
                             list<int> Res = [], int UOps = 1,
                             int LoadUOps = 0> {
  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                          /*LoadLat*/5, /*LoadRes*/1, LoadUOps>;
}

// 256-bit vector pair: vector loads are 5 cycles. Lat and Res have no default
// here, and UOps defaults to 2.
// NOTE(review): LoadRes = 2 is not derivable from this file - confirm.
multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps = 2,
                             int LoadUOps = 0> {
  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                          /*LoadLat*/5, /*LoadRes*/2, LoadUOps>;
}

//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

// L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
// needn't be available until 4 cycles after the memory operand.
// NOTE(review): arguments reconstructed from the comment above - confirm.
def : ReadAdvance<ReadAfterLd, 4>;

// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
// until 5 cycles after the memory operand.
// NOTE(review): arguments reconstructed from the comment above - confirm.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

// A folded store needs a cycle on the PdStore for the store data.
def : WriteRes;

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'def : WriteRes' / 'defm : PdWriteRes*' records in this section
// appear to have been stripped. They are reproduced verbatim below and must be
// restored from the upstream source before this file can be processed.

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes {
  let Latency = 5;
}
def : WriteRes;
def : WriteRes;
def : WriteRes;

// Load/store MXCSR.
// FIXME: These are copy and pasted from WriteLoad/Store.
def : WriteRes {
  let Latency = 5;
}
def : WriteRes {
  let NumMicroOps = 2;
}

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteResExPair;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes {
  let Latency = 100;
}
def : WriteRes {
  let Latency = 100;
}
def : WriteRes;

// XLAT: 6-cycle latency on either ALU pipe.
def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
  let Latency = 6;
}
def : InstRW<[PdWriteXLAT], (instrs XLAT)>;

// Register forms of LAR and LSL: 184-cycle latency, 45 micro-ops.
def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
  let Latency = 184;
  let NumMicroOps = 45;
}
def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
                                        "LSL(16|32|64)rr")>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'def : WriteRes' / 'defm : PdWriteRes*' / 'defm : X86WriteRes*'
// records in this section appear to have been stripped. They are reproduced
// verbatim below and must be restored from the upstream source.

defm : PdWriteResExPair;

// Locked exchange-and-add.
def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
  let Latency = 6;
  let NumMicroOps = 4;
}
def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;

// Register forms of the bit-manipulation instructions listed below.
def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
  let Latency = 2;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteBMI1],
             (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
                     BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
                     BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
                     BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
                     TZMSK32rr, TZMSK64rr)>;

// Memory forms of the same instructions.
def PdWriteBMI1m : SchedWriteRes<[PdEX01]> {
  let Latency = 6;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteBMI1m],
             (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
                     BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
                     BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
                     BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
                     TZMSK32rm, TZMSK64rm)>;

defm : PdWriteResExPair;

defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;

// CMPXCHG family: all use PdEX1 with latency 3 and differ in micro-op count.
def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
  let Latency = 3;
  let NumMicroOps = 3;
}
def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
  let Latency = 3;
  let NumMicroOps = 5;
}
def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;

def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
  let Latency = 3;
  let NumMicroOps = 6;
}
def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
             (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
  let Latency = 3;
  let NumMicroOps = 18;
}
def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
  let Latency = 3;
  let NumMicroOps = 22;
}
def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;

def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> {
  let Latency = 2;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>;

// Exchange-and-add, register forms.
def PdWriteXADD : SchedWriteRes<[PdEX1]> {
  let Latency = 2;
  let NumMicroOps = 4;
}
def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;

// Exchange-and-add, memory forms.
def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
  let Latency = 6;
  let NumMicroOps = 4;
}
def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;

defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : X86WriteResUnsupported; // BMI2 MULX

defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;

defm : PdWriteResExPair;

// CRC32 register forms that get their own latency / micro-op counts.
def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
  let Latency = 5;
  let ResourceCycles = [4];
  let NumMicroOps = 5;
}
def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;

def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
  let Latency = 6;
  let ResourceCycles = [4];
  let NumMicroOps = 7;
}
def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;

def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
  let Latency = 10;
  let ResourceCycles = [4];
  let NumMicroOps = 11;
}
def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;

defm : PdWriteResExPair; // Conditional move.
defm : PdWriteResExPair; // Conditional (CF + ZF flag) move.

// These memory-form CMOVs are scheduled with the folded WriteCMOV2 class.
def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm,
                                          CMOVGE16rm, CMOVGE32rm, CMOVGE64rm,
                                          CMOVL16rm, CMOVL32rm, CMOVL64rm,
                                          CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>;

defm : PdWriteRes; // x87 conditional move.

def : WriteRes; // Setcc.
def : WriteRes;

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'def : WriteRes' / 'defm : PdWriteRes*' records in this section
// appear to have been stripped. They are reproduced verbatim below and must be
// restored from the upstream source.

// Memory forms of SETGE/SETG/SETLE/SETL: two micro-ops, two ALU cycles.
def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm,
                                                      SETLEm, SETLm)>;

defm : PdWriteRes;

// Model-specific write for LAHF. Carries the 'Pd' prefix used by every other
// SchedWriteRes in this file, so it cannot clash with a generic 'Write*'
// SchedWrite or with another CPU model's record.
def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
  let Latency = 2;
  let NumMicroOps = 4;
}
def : InstRW<[PdWriteLAHF], (instrs LAHF)>;

// Model-specific write for SAHF; see the naming note on PdWriteLAHF.
def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
  let Latency = 2;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteSAHF], (instrs SAHF)>;

defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;

// This is for simple LEAs with one or two input operands.
// FIXME: SAGU 3-operand LEA
def : WriteRes {
  let NumMicroOps = 2;
}

// Bit counts.
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;

// BMI1 BEXTR, BMI2 BZHI
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'defm : PdWriteRes*' / 'defm : X86WriteRes*' records in this
// section appear to have been stripped. They are reproduced verbatim below and
// must be restored from the upstream source.

defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;
defm : PdWriteResExPair;

// Rotate-through-carry register forms. Each group below gets its own
// latency / micro-op count.
def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 12;
  let NumMicroOps = 26;
}
def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;

def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
  let Latency = 12;
  let NumMicroOps = 23;
}
def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;

def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 11;
  let NumMicroOps = 24;
}
def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;

def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 10;
  let NumMicroOps = 22;
}
def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;

def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
  let Latency = 10;
  let NumMicroOps = 19;
}
def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;

def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 7;
  let NumMicroOps = 17;
}
def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>;

def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 7;
  let NumMicroOps = 16;
}
def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>;

def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 7;
  let NumMicroOps = 16;
}
def : InstRW<[PdWriteRCR32rCL], (instrs RCR32rCL)>;

def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
  let Latency = 7;
  let NumMicroOps = 15;
}
def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;

def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
  let Latency = 9;
  let NumMicroOps = 20;
}
def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;

def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
  let Latency = 11;
  let NumMicroOps = 21;
}
def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;

def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
  let Latency = 8;
  let NumMicroOps = 16;
}
def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;

def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
  let Latency = 13;
  let NumMicroOps = 25;
}
def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;

// SHLD/SHRD.
defm : PdWriteRes;
defm : PdWriteRes;

def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 6;
}
def : InstRW<[PdWriteSHLD32rri8SHRD16rri8], (instrs SHLD32rri8, SHRD16rri8)>;

def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 7;
}
def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
                                                              SHLD32rrCL,
                                                              SHRD32rrCL)>;

defm : PdWriteRes;
defm : PdWriteRes;

////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;

// Store forms of MOVHPS/MOVHPD and their VEX variants.
def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  let Latency = 2;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr,
                                      VMOVHPDmr, VMOVHPSmr)>;

// 256-bit unaligned FP stores: 8 micro-ops.
def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  let NumMicroOps = 8;
}
def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;

defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;

// Memory forms of x87 FCOM/FCOMP.
def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
  let Latency = 6;
}
def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;

// TST_F and UCOM_FPPr use the SchedWriteRes defaults on PdFPU1 + PdFPFMA.
def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
  let Latency = 25;
  let ResourceCycles = [1, 3];
  let NumMicroOps = 17;
}
def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

// XOP VFRCZ*: 128-bit register forms.
def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 10;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr,
                                     VFRCZSDrr, VFRCZSSrr)>;

// XOP VFRCZ*: 128-bit memory forms.
def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 15;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
                                      VFRCZSDrm, VFRCZSSrm)>;

// XOP VFRCZ*: 256-bit register forms.
def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 10;
  let ResourceCycles = [2, 1];
  let NumMicroOps = 4;
}
def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;

// XOP VFRCZ*: 256-bit memory forms.
def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 15;
  let ResourceCycles = [2, 1];
  let NumMicroOps = 8;
}
def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  let Latency = 7;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;

def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  let Latency = 2;
}
def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;

def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  let Latency = 7;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;

def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  let Latency = 4;
  let NumMicroOps = 8;
}
def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;

def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  let Latency = 8; // 4 + 4
  let NumMicroOps = 10;
}
def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'defm : PdWriteRes*' / 'defm : X86WriteRes*' records in this
// section appear to have been stripped. They are reproduced verbatim below and
// must be restored from the upstream source.

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 6;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>;

// FIXME: f+3 ST, LD+STC latency
defm : PdWriteResXMMPair;
// FIXME: .Folded version is one NumMicroOp *less*..
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
// FIXME: .Folded version is one NumMicroOp *less*..

// Model-specific write for the 64-bit integer to scalar FP conversions.
// Carries the 'Pd' prefix used by the other SchedWriteRes records in this
// file, so it cannot clash with a generic 'Write*' SchedWrite or with another
// CPU model's record.
def PdWriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 13;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

// MMX <-> packed double conversions; 'Pd'-prefixed like the rest of the file.
def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 6;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
                                                              MMX_CVTPI2PDirr)>;

// MMX -> packed single conversion; 'Pd'-prefixed like the rest of the file.
def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  let Latency = 4;
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteRes;
defm : PdWriteRes;
defm : X86WriteResUnsupported;

defm : PdWriteRes;
defm : PdWriteRes;
defm : X86WriteResUnsupported;
////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'defm : PdWriteRes*' / 'defm : X86WriteRes*' records in this
// section appear to have been stripped. They are reproduced verbatim below and
// must be restored from the upstream source.

defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;

// 256-bit unaligned integer store: 8 micro-ops.
def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  let NumMicroOps = 8;
}
def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;

defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;
defm : PdWriteRes;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

// XOP VPMACS*DQ multiply-accumulate, register forms. Named with this file's
// 'Pd' prefix (it previously carried a 'J' prefix, unlike every other
// model-specific record here).
def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> {
  let Latency = 4;
  let ResourceCycles = [2, 1, 2, 1];
}
def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr,
                                      VPMACSSDQHrr, VPMACSSDQLrr)>;

defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteRes;
defm : PdWriteRes;

defm : PdWriteRes;
defm : PdWriteRes;

// SSE4A EXTRQ / EXTRQI.
def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  let Latency = 3;
}
def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteRes;

defm : PdWriteRes;
defm : X86WriteResUnsupported;
// NOTE(review): the next line is assumed to be a commented-out definition that
// ends at its ';' (the line breaks of this file were lost) - confirm.
// defm : X86WriteResUnsupported;
defm : PdWriteRes;

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): in this copy of the file the template arguments of the
// anonymous 'defm : PdWriteRes*' / 'defm : X86WriteRes*' records in this
// section appear to have been stripped. They are reproduced verbatim below and
// must be restored from the upstream source.

defm : PdWriteResXMMPair;
defm : PdWriteResYMMPair;
defm : X86WriteResPairUnsupported;

defm : PdWriteResXMMPair;
defm : PdWriteResXMMPair;
defm : X86WriteResPairUnsupported;
defm : X86WriteResPairUnsupported;

// Register forms of the 128-bit integer horizontal add/sub instructions are
// scheduled as WritePHAdd.
def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
                                   PHADDWrr, PHSUBWrr,
                                   PHADDSWrr, PHSUBSWrr,
                                   VPHADDDrr, VPHSUBDrr,
                                   VPHADDWrr, VPHSUBWrr,
                                   VPHADDSWrr, VPHSUBSWrr)>;

// Memory forms of the same instructions use the folded variant.
def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
                                          PHADDWrm, PHSUBWrm,
                                          PHADDSWrm, PHSUBSWrm,
                                          VPHADDDrm, VPHSUBDrm,
                                          VPHADDWrm, VPHSUBWrm,
                                          VPHADDSWrm, VPHSUBSWrm)>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : PdWriteResXMMPair;

// VEX-encoded register form gets its own latency / micro-op count.
def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
  let Latency = 13;
  let NumMicroOps = 6;
}
def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

// INSERTQ / INSERTQI: one cycle on PdFPU01, four on PdFPMAL.
def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  let Latency = 3;
  let ResourceCycles = [1, 4];
}
def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

// 256-bit broadcast from memory.
def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
  let Latency = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps = 2;
}
def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
                                                          VBROADCASTSSYrm)>;

// VZEROALL consumes no modelled resource; only latency and micro-ops are set.
def PdWriteVZEROALL : SchedWriteRes<[]> {
  let Latency = 90;
  let NumMicroOps = 32;
}
def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;

// VZEROUPPER consumes no modelled resource; only latency and micro-ops are set.
def PdWriteVZEROUPPER : SchedWriteRes<[]> {
  let Latency = 46;
  let NumMicroOps = 16;
}
def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
//  SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// Zero-latency write with no resources; selected by the variants below.
def PdWriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
}

// NOTE(review): in this copy of the file the first template argument of every
// SchedVar had been stripped ('SchedVar, [PdWriteZeroLatency]>'). The
// predicates below are reconstructed: the first alternative is assumed to test
// ZeroIdiomPredicate (the predicate used by IsZeroIdiomFunction in this file)
// and the second is assumed to be the always-true fallback. Confirm against
// the upstream source.

// GPR zero idioms: zero latency when the predicate matches, otherwise the
// regular WriteALU cost.
def PdWriteZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
    SchedVar<MCSchedPredicate<TruePred>,           [WriteALU]>
]>;
def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
                                         XOR32rr, XOR64rr)>;

// 128-bit FP logic zero idioms: zero latency when the predicate matches,
// otherwise the regular WriteFLogic cost.
def PdWriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
    SchedVar<MCSchedPredicate<TruePred>,           [WriteFLogic]>
]>;
def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
                                          XORPDrr, VXORPDrr,
                                          ANDNPSrr, VANDNPSrr,
                                          ANDNPDrr, VANDNPDrr)>;

// VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1.
// NOTE(review): in this copy of the file the first template argument of every
// SchedVar had been stripped ('SchedVar, [PdWriteZeroLatency]>'). The
// predicates below are reconstructed: the first alternative is assumed to test
// ZeroIdiomPredicate (the predicate used by IsZeroIdiomFunction below) and the
// second is assumed to be the always-true fallback. Confirm against the
// upstream source.

// MMX logic zero idioms.
def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogic]>
]>;
def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;

// 128-bit integer logic zero idioms.
def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogicX]>
]>;
def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
                                                PANDNrr, VPANDNrr)>;

// MMX integer ALU zero idioms.
def PdWriteVZeroIdiomALU : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALU]>
]>;
def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
                                             MMX_PSUBQirr, MMX_PSUBWirr,
                                             MMX_PCMPGTBirr,
                                             MMX_PCMPGTDirr,
                                             MMX_PCMPGTWirr)>;

// 128-bit integer ALU zero idioms.
def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALUX]>
]>;
def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
                                              PSUBDrr, VPSUBDrr,
                                              PSUBQrr, VPSUBQrr,
                                              PSUBWrr, VPSUBWrr,
                                              PCMPGTBrr, VPCMPGTBrr,
                                              PCMPGTDrr, VPCMPGTDrr,
                                              PCMPGTWrr, VPCMPGTWrr)>;

///////////////////////////////////////////////////////////////////////////////
//  Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// VPCMPGTQ, but not PCMPGTQ!

// Opcodes reported as zero idioms when ZeroIdiomPredicate holds.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>
]>;

// Opcodes reported as dependency-breaking when their predicate holds.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
    // But not PCMPEQQrr.
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
    // But not VPCMPEQQrr.
  ], ZeroIdiomPredicate>
]>;


} // SchedModel